In [4]:
from torch.utils.data import Dataset
import pandas as pd
import os
import datetime as dt
import numpy as np

In [32]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [89]:
root_dir = '/home/jayeon/Documents/code/Hemodialysis/data'
files = ['Hemodialysis1_1007.csv', 'Hemodialysis2_1007.csv']

In [2]:
def refine_dataset(hemodialysis_frame):
    hemodialysis_frame['Pt_id'] = [x.replace('*', '') for x in hemodialysis_frame['Pt_id']]
    hemodialysis_frame['Pt_id'] = hemodialysis_frame['Pt_id'].astype(str).astype(int)
    hemodialysis_frame['ID_hd'] = [x.replace('*', '') for x in hemodialysis_frame['ID_hd']]
    hemodialysis_frame['ID_hd'] = hemodialysis_frame['ID_hd'].astype(str).astype(int)
    categorical = ['HD_type', 'HD_acces', 'HD_prim', 'HD_dialysate', 'HD_dialyzer']
    drop_columns = ['VS_rr', 'Time']
    hemodialysis_frame.drop(labels=drop_columns, inplace=True, axis=1)
    timestamp = pd.to_datetime(hemodialysis_frame['HD_duration']).dt
    hemodialysis_frame['HD_duration'] = timestamp.hour * 60 + timestamp.minute
    hemodialysis_frame.fillna(method='ffill', inplace=True)
    hemodialysis_frame['Pt_sex'] = hemodialysis_frame['Pt_sex'].replace({'M': 0, 'F': 1})
    hemodialysis_frame = pd.get_dummies(hemodialysis_frame, columns=categorical, prefix=categorical)
    hemodialysis_frame = hemodialysis_frame.loc[
        (hemodialysis_frame.VS_sbp > 0) & (hemodialysis_frame.VS_dbp > 0)]
    return hemodialysis_frame

In [26]:
def normalize(hemodialysis_frame, mean_for_normalize, std_for_normalize):
    print('Normalizing...')
    hemodialysis_frame['HD_ctime_raw'] = hemodialysis_frame['HD_ctime']
    hemodialysis_frame['HD_ntime_raw'] = hemodialysis_frame['HD_ntime']
    numerical_col = ['Pt_age', 'HD_ntime', 'HD_ctime', 'HD_prewt', 'HD_uf', 'VS_sbp', 'VS_dbp', 'VS_hr', 'VS_bt', 'VS_bfr', 'VS_uft', 'Lab_wbc', 'Lab_hb', 'Lab_plt', 'Lab_chol', 'Lab_alb', 'Lab_glu', 'Lab_ca', 'Lab_phos', 'Lab_ua', 'Lab_bun', 'Lab_scr', 'Lab_na', 'Lab_k', 'Lab_cl', 'Lab_co2']
    for col in numerical_col:
        mean_for_normalize[col] = hemodialysis_frame[col].mean()
        std_for_normalize[col] = hemodialysis_frame[col].std()
        if std_for_normalize[col] > 0:
            hemodialysis_frame[col] = (hemodialysis_frame[col] - mean_for_normalize[col]) / std_for_normalize[col]
        else:
            hemodialysis_frame[col] = 0

In [140]:
hemodialysis_frame = pd.DataFrame()
for f in files:
    tmp = pd.read_csv(os.path.join(root_dir, f), header=0)
    hemodialysis_frame = pd.concat([hemodialysis_frame, tmp])
hemodialysis_frame = refine_dataset(hemodialysis_frame)
mean_for_normalize = {}
std_for_normalize = {}
normalize(hemodialysis_frame, mean_for_normalize, std_for_normalize)

Normalizing...


In [106]:
hemodialysis_frame.head()

Unnamed: 0,ID_hd,ID_timeline,Pt_id,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ctime_raw,HD_ntime_raw
0,7712599201309280,*07712599*20130928*1435,7712599,0,-1.347976,120,-1.287793,-1.428803,1.192881,-0.512735,0,1,0.733904,0.818963,0.465706,0.658684,-2.601968,-0.995073,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,7712599201309280,*07712599*20130928*1448,7712599,0,-1.347976,120,-0.758557,-1.264009,1.192881,-0.512735,0,1,0.77113,0.889489,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,13,13
2,7712599201309280,*07712599*20130928*1518,7712599,0,-1.347976,120,-0.066479,-0.883716,1.192881,-0.512735,0,1,1.068935,1.030542,0.002421,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,43,30
3,7712599201309280,*07712599*20130928*1535,7712599,0,-1.347976,120,-0.595715,-0.668216,1.192881,-0.512735,0,1,1.068935,1.171595,0.118242,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,60,17
4,7712599201309280,*07712599*20130928*1548,7712599,0,-1.347976,120,-0.758557,-0.503422,1.192881,-0.512735,0,1,1.143386,1.383174,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,73,13


In [125]:
hemodialysis_frame.shape

(1863552, 138)

In [126]:
def make_sequence(hemodialysis_frame):
    hemodialysis_frame['time'] = [dt.datetime.strptime(x[10:], '%Y%m%d*%H%M') for x in hemodialysis_frame['ID_timeline']]
    hemodialysis_frame['prev_time'] = hemodialysis_frame.apply(lambda x: x.time - dt.timedelta(minutes=x.HD_ntime_raw), axis=1)
    target_data = hemodialysis_frame.loc[hemodialysis_frame.HD_ctime_raw > 0][['ID_hd', 'prev_time', 'HD_ntime', 'VS_sbp', 'VS_dbp']]
    init_data = hemodialysis_frame.loc[hemodialysis_frame.HD_ctime_raw == 0][['ID_hd', 'VS_sbp', 'VS_dbp']]
    print(target_data.shape)
    print(init_data.shape)
    print('merge!')
    hemodialysis_frame = hemodialysis_frame.merge(target_data, how='inner', left_on=['ID_hd', 'time'], right_on=['ID_hd', 'prev_time'], suffixes=('', '_target'))
    print(hemodialysis_frame.shape)
    hemodialysis_frame = init_data.merge(hemodialysis_frame, how='inner', on=['ID_hd'], suffixes=('_init', ''))
    print(hemodialysis_frame.shape)
    drop_columns = ['ID_timeline', 'HD_ntime_raw', 'time', 'prev_time', 'prev_time_target']
    hemodialysis_frame.drop(labels=drop_columns, axis=1, inplace=True)
    print(hemodialysis_frame.shape)
    return hemodialysis_frame

In [130]:
def add_target_class(hemodialysis_frame, std_for_normalize):
    def eval_target(diff, type):
        if type == 'sbp':
            if diff < -20 :
                return 0
            if diff < -10 :
                return 1
            if diff < -5 :
                return 2
            if diff < 5 :
                return 3
            else:
                return 4
        if type == 'dbp':
            if diff < -10:
                return 0
            if diff < -5:
                return 1
            if diff < 5:
                return 2
            else:
                return 3

    hemodialysis_frame['VS_sbp_target_class'] = (
                (hemodialysis_frame['VS_sbp_target'] - hemodialysis_frame['VS_sbp']) * std_for_normalize[
            'VS_sbp']).apply(lambda x: eval_target(x, 'sbp'))
    hemodialysis_frame['VS_dbp_target_class'] = (
                (hemodialysis_frame['VS_dbp_target'] - hemodialysis_frame['VS_dbp']) * std_for_normalize[
            'VS_dbp']).apply(lambda x: eval_target(x, 'dbp'))

In [141]:
hemodialysis_frame = make_sequence(hemodialysis_frame)

(1601765, 5)
(261787, 3)
merge!
(1601736, 144)
(1601702, 146)
(1601702, 141)


In [142]:
hemodialysis_frame.shape

(1601702, 141)

In [143]:
add_target_class(hemodialysis_frame, std_for_normalize)

In [144]:
hemodialysis_frame.head()

Unnamed: 0,ID_hd,VS_sbp_init,VS_dbp_init,Pt_id,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ctime_raw,HD_ntime_target,VS_sbp_target,VS_dbp_target,VS_sbp_target_class,VS_dbp_target_class
0,7712599201309280,0.733904,0.818963,7712599,0,-1.347976,120,-1.287793,-1.428803,1.192881,-0.512735,0,1,0.733904,0.818963,0.465706,0.658684,-2.601968,-0.995073,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-0.758557,0.77113,0.889489,3,2
1,7712599201309280,0.733904,0.818963,7712599,0,-1.347976,120,-0.758557,-1.264009,1.192881,-0.512735,0,1,0.77113,0.889489,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,13,-0.066479,1.068935,1.030542,4,2
2,7712599201309280,0.733904,0.818963,7712599,0,-1.347976,120,-0.066479,-0.883716,1.192881,-0.512735,0,1,1.068935,1.030542,0.002421,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,43,-0.595715,1.068935,1.171595,3,2
3,7712599201309280,0.733904,0.818963,7712599,0,-1.347976,120,-0.595715,-0.668216,1.192881,-0.512735,0,1,1.068935,1.171595,0.118242,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,60,-0.758557,1.143386,1.383174,3,2
4,7712599201309280,0.733904,0.818963,7712599,0,-1.347976,120,-0.758557,-0.503422,1.192881,-0.512735,0,1,1.143386,1.383174,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,73,-0.066479,1.217837,1.242121,3,2


In [145]:
def add_pre_hemodialysis(hemodialysis_frame):
    min_bp = hemodialysis_frame.groupby(['Pt_id', 'ID_hd'])['VS_sbp', 'VS_dbp'].min().reset_index()
    max_bp = hemodialysis_frame.groupby(['Pt_id', 'ID_hd'])['VS_sbp', 'VS_dbp'].max().reset_index()
    init_bp = hemodialysis_frame.loc[hemodialysis_frame.HD_ctime_raw == 0][['Pt_id', 'ID_hd', 'VS_sbp', 'VS_dbp']]
    init_bp.columns = ['Pt_id', 'ID_hd', 'VS_sbp_init_in_pre_hd', 'VS_dbp_init_in_pre_hd']
    pre_hd = min_bp.merge(max_bp, how='inner', on=['Pt_id', 'ID_hd'], suffixes=('_min_in_pre_hd', '_max_in_pre_hd'))
    pre_hd = init_bp.merge(pre_hd, how='inner', on=['Pt_id', 'ID_hd'])
    pre_hd['rank'] = pre_hd.sort_values(['Pt_id', 'ID_hd'], ascending=[True, True]).groupby(['Pt_id']).cumcount() + 1
    del min_bp, max_bp, init_bp
    print(hemodialysis_frame.shape)
    hemodialysis_frame = hemodialysis_frame.merge(pre_hd[['Pt_id', 'ID_hd', 'rank']], on=['Pt_id', 'ID_hd'], how='inner')
    print(hemodialysis_frame.shape)
    hemodialysis_frame['rank'] -= 1
    hemodialysis_frame = hemodialysis_frame.merge(pre_hd, on=['Pt_id', 'rank'], how='left', suffixes=('', '_pre'))
    print(hemodialysis_frame.shape)
    hemodialysis_frame['pre_hd'] = [0 if np.isnan(x) else 1 for x in hemodialysis_frame['VS_sbp_min_in_pre_hd']]
    print(hemodialysis_frame.shape)
    hemodialysis_frame.drop(labels=['Pt_id', 'rank', 'ID_hd_pre', 'HD_ctime_raw'], axis=1, inplace=True)
    print(hemodialysis_frame.shape)
    hemodialysis_frame.fillna(0.0, inplace=True)
    return hemodialysis_frame

In [146]:
hemodialysis_frame = add_pre_hemodialysis(hemodialysis_frame)

(1601702, 143)
(1601699, 144)
(1601699, 151)
(1601699, 152)
(1601699, 148)


In [147]:
hemodialysis_frame.shape

(1601699, 148)

In [148]:
hemodialysis_frame.head()

Unnamed: 0,ID_hd,VS_sbp_init,VS_dbp_init,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ntime_target,VS_sbp_target,VS_dbp_target,VS_sbp_target_class,VS_dbp_target_class,VS_sbp_init_in_pre_hd,VS_dbp_init_in_pre_hd,VS_sbp_min_in_pre_hd,VS_dbp_min_in_pre_hd,VS_sbp_max_in_pre_hd,VS_dbp_max_in_pre_hd,pre_hd
0,7712599201309280,0.733904,0.818963,0,-1.347976,120,-1.287793,-1.428803,1.192881,-0.512735,0,1,0.733904,0.818963,0.465706,0.658684,-2.601968,-0.995073,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.758557,0.77113,0.889489,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0
1,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.758557,-1.264009,1.192881,-0.512735,0,1,0.77113,0.889489,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.066479,1.068935,1.030542,4,2,0.0,0.0,0.0,0.0,0.0,0.0,0
2,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.066479,-0.883716,1.192881,-0.512735,0,1,1.068935,1.030542,0.002421,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.595715,1.068935,1.171595,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0
3,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.595715,-0.668216,1.192881,-0.512735,0,1,1.068935,1.171595,0.118242,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.758557,1.143386,1.383174,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0
4,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.758557,-0.503422,1.192881,-0.512735,0,1,1.143386,1.383174,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.066479,1.217837,1.242121,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0


In [149]:
def order_target_column(hemodialysis_frame):
    target_columns = ['VS_sbp_target', 'VS_dbp_target', 'VS_sbp_target_class', 'VS_dbp_target_class']
    input_columns = hemodialysis_frame.columns[
        [False if x in target_columns else True for x in hemodialysis_frame.columns]].to_list()
    hemodialysis_frame = hemodialysis_frame[input_columns + target_columns]
    return hemodialysis_frame

In [150]:
hemodialysis_frame = order_target_column(hemodialysis_frame)

In [152]:
hemodialysis_frame.shape

(1601699, 148)

In [153]:
hemodialysis_frame.head()

Unnamed: 0,ID_hd,VS_sbp_init,VS_dbp_init,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ntime_target,VS_sbp_init_in_pre_hd,VS_dbp_init_in_pre_hd,VS_sbp_min_in_pre_hd,VS_dbp_min_in_pre_hd,VS_sbp_max_in_pre_hd,VS_dbp_max_in_pre_hd,pre_hd,VS_sbp_target,VS_dbp_target,VS_sbp_target_class,VS_dbp_target_class
0,7712599201309280,0.733904,0.818963,0,-1.347976,120,-1.287793,-1.428803,1.192881,-0.512735,0,1,0.733904,0.818963,0.465706,0.658684,-2.601968,-0.995073,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.758557,0.0,0.0,0.0,0.0,0.0,0.0,0,0.77113,0.889489,3,2
1,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.758557,-1.264009,1.192881,-0.512735,0,1,0.77113,0.889489,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.066479,0.0,0.0,0.0,0.0,0.0,0.0,0,1.068935,1.030542,4,2
2,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.066479,-0.883716,1.192881,-0.512735,0,1,1.068935,1.030542,0.002421,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.595715,0.0,0.0,0.0,0.0,0.0,0.0,0,1.068935,1.171595,3,2
3,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.595715,-0.668216,1.192881,-0.512735,0,1,1.068935,1.171595,0.118242,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.758557,0.0,0.0,0.0,0.0,0.0,0.0,0,1.143386,1.383174,3,2
4,7712599201309280,0.733904,0.818963,0,-1.347976,120,-0.758557,-0.503422,1.192881,-0.512735,0,1,1.143386,1.383174,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.066479,0.0,0.0,0.0,0.0,0.0,0.0,0,1.217837,1.242121,3,2


In [157]:
def split_train_test(hemodialysis_frame):
    print("Splitting...")
    np.random.seed(212)
    key = hemodialysis_frame['ID_hd'].unique()
    idx = np.random.permutation(range(len(key)))
    train_split, val_split = int(np.floor(0.7 * len(key))), int(np.floor(0.8 * len(key)))
    train_idx, val_idx, test_idx = idx[:train_split], idx[train_split:val_split], idx[val_split:]
    training_type = pd.DataFrame(data=key, columns=['ID_hd'])
    training_type['ID_class'] = 0; training_type.loc[train_idx, 'ID_class'] = 'Train'; training_type.loc[val_idx, 'ID_class'] = 'Validation'; training_type.loc[test_idx, 'ID_class'] = 'Test'
    hemodialysis_frame = training_type.merge(hemodialysis_frame, how='inner', left_on=["ID_hd"], right_on=["ID_hd"])
    return hemodialysis_frame

In [158]:
hemodialysis_frame = split_train_test(hemodialysis_frame)

Splitting...


In [159]:
hemodialysis_frame.shape

(1601699, 149)

In [160]:
hemodialysis_frame.head()

Unnamed: 0,ID_hd,ID_class,VS_sbp_init,VS_dbp_init,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ntime_target,VS_sbp_init_in_pre_hd,VS_dbp_init_in_pre_hd,VS_sbp_min_in_pre_hd,VS_dbp_min_in_pre_hd,VS_sbp_max_in_pre_hd,VS_dbp_max_in_pre_hd,pre_hd,VS_sbp_target,VS_dbp_target,VS_sbp_target_class,VS_dbp_target_class
0,7712599201309280,Train,0.733904,0.818963,0,-1.347976,120,-1.287793,-1.428803,1.192881,-0.512735,0,1,0.733904,0.818963,0.465706,0.658684,-2.601968,-0.995073,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.758557,0.0,0.0,0.0,0.0,0.0,0.0,0,0.77113,0.889489,3,2
1,7712599201309280,Train,0.733904,0.818963,0,-1.347976,120,-0.758557,-1.264009,1.192881,-0.512735,0,1,0.77113,0.889489,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.066479,0.0,0.0,0.0,0.0,0.0,0.0,0,1.068935,1.030542,4,2
2,7712599201309280,Train,0.733904,0.818963,0,-1.347976,120,-0.066479,-0.883716,1.192881,-0.512735,0,1,1.068935,1.030542,0.002421,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.595715,0.0,0.0,0.0,0.0,0.0,0.0,0,1.068935,1.171595,3,2
3,7712599201309280,Train,0.733904,0.818963,0,-1.347976,120,-0.595715,-0.668216,1.192881,-0.512735,0,1,1.068935,1.171595,0.118242,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.758557,0.0,0.0,0.0,0.0,0.0,0.0,0,1.143386,1.383174,3,2
4,7712599201309280,Train,0.733904,0.818963,0,-1.347976,120,-0.758557,-0.503422,1.192881,-0.512735,0,1,1.143386,1.383174,0.234063,0.658684,-2.601968,-0.592402,1,0,0,0.0,0.0,0.0,1.371115,0.685251,1.258131,0.344324,-0.771488,-1.240124,-0.438955,1.168972,0.038065,0.221689,0.016399,-1.017,0.021743,1.107358,-2.218833,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.066479,0.0,0.0,0.0,0.0,0.0,0.0,0,1.217837,1.242121,3,2


In [161]:
columns = list(filter(lambda x: x not in ['ID_class', 'ID_hd'], hemodialysis_frame.columns))
columns

['VS_sbp_init',
 'VS_dbp_init',
 'Pt_sex',
 'Pt_age',
 'HD_duration',
 'HD_ntime',
 'HD_ctime',
 'HD_prewt',
 'HD_uf',
 'HD_hep',
 'HD_fut',
 'VS_sbp',
 'VS_dbp',
 'VS_hr',
 'VS_bt',
 'VS_bfr',
 'VS_uft',
 'CL_adm',
 'CL_dm',
 'CL_htn',
 'CL_cad',
 'CL_donor',
 'CL_recipient',
 'Lab_wbc',
 'Lab_hb',
 'Lab_plt',
 'Lab_chol',
 'Lab_alb',
 'Lab_glu',
 'Lab_ca',
 'Lab_phos',
 'Lab_ua',
 'Lab_bun',
 'Lab_scr',
 'Lab_na',
 'Lab_k',
 'Lab_cl',
 'Lab_co2',
 'MED_bb',
 'MED_ccb',
 'MED_aceiarb',
 'MED_spirono',
 'MED_lasix',
 'MED_statin',
 'MED_minox',
 'MED_aspirin',
 'MED_plavix',
 'MED_warfarin',
 'MED_oha',
 'MED_insulin',
 'MED_allop',
 'MED_febuxo',
 'MED_epo',
 'MED_pbindca',
 'MED_pbindnoca',
 'HD_type_HD',
 'HD_type_HDF',
 'HD_type_HDR',
 'HD_type_HF',
 'HD_type_HFR',
 'HD_type_Hemoperfusion',
 'HD_type_SUF',
 'HD_acces_AVF',
 'HD_acces_AVG',
 'HD_acces_FVC',
 'HD_acces_IJC',
 'HD_acces_Others',
 'HD_acces_Perm',
 'HD_prim_blood',
 'HD_prim_half',
 'HD_prim_with',
 'HD_prim_without',


In [162]:
df = hemodialysis_frame.loc[hemodialysis_frame.ID_class == 'Validation'].copy()

In [163]:
df.head()

Unnamed: 0,ID_hd,ID_class,VS_sbp_init,VS_dbp_init,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ntime_target,VS_sbp_init_in_pre_hd,VS_dbp_init_in_pre_hd,VS_sbp_min_in_pre_hd,VS_dbp_min_in_pre_hd,VS_sbp_max_in_pre_hd,VS_dbp_max_in_pre_hd,pre_hd,VS_sbp_target,VS_dbp_target,VS_sbp_target_class,VS_dbp_target_class
63,7713592201612190,Validation,-0.792345,-0.379986,1,0.550147,180,-1.287793,-1.428803,-0.851625,0.413566,1,0,-0.792345,-0.379986,1.044813,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.799268,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-0.866796,-0.732617,3,2
64,7713592201612190,Validation,-0.792345,-0.379986,1,0.550147,180,-0.799268,-1.276686,-0.851625,0.413566,1,0,-0.866796,-0.732617,1.276455,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.473584,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-0.941247,-0.521038,3,2
65,7713592201612190,Validation,-0.792345,-0.379986,1,0.550147,180,-0.473584,-1.023157,-0.851625,0.413566,1,0,-0.941247,-0.521038,1.218545,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.1479,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-1.42518,-1.155776,1,1
66,7713592201612190,Validation,-0.792345,-0.379986,1,0.550147,180,-0.1479,-0.668216,-0.851625,0.413566,1,0,-1.42518,-1.155776,1.392277,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.473584,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-1.536856,-0.521038,3,3
67,7713592201612190,Validation,-0.792345,-0.379986,1,0.550147,180,-0.473584,-0.414687,-0.851625,0.413566,1,0,-1.536856,-0.521038,1.623919,0.280949,-0.948728,0.21294,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.473584,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-0.82957,0.536857,4,3


In [164]:
df = df.drop('ID_class', axis=1)

In [165]:
df.head()

Unnamed: 0,ID_hd,VS_sbp_init,VS_dbp_init,Pt_sex,Pt_age,HD_duration,HD_ntime,HD_ctime,HD_prewt,HD_uf,HD_hep,HD_fut,VS_sbp,VS_dbp,VS_hr,VS_bt,VS_bfr,VS_uft,CL_adm,CL_dm,CL_htn,CL_cad,CL_donor,CL_recipient,Lab_wbc,Lab_hb,Lab_plt,Lab_chol,Lab_alb,Lab_glu,Lab_ca,Lab_phos,Lab_ua,Lab_bun,Lab_scr,Lab_na,Lab_k,Lab_cl,Lab_co2,MED_bb,MED_ccb,MED_aceiarb,MED_spirono,MED_lasix,MED_statin,MED_minox,MED_aspirin,MED_plavix,MED_warfarin,MED_oha,MED_insulin,MED_allop,MED_febuxo,MED_epo,MED_pbindca,MED_pbindnoca,HD_type_HD,HD_type_HDF,HD_type_HDR,HD_type_HF,HD_type_HFR,HD_type_Hemoperfusion,HD_type_SUF,HD_acces_AVF,HD_acces_AVG,HD_acces_FVC,HD_acces_IJC,HD_acces_Others,HD_acces_Perm,HD_prim_blood,HD_prim_half,HD_prim_with,HD_prim_without,HD_dialysate_APS-15U,HD_dialysate_B DEX 0.15%,HD_dialysate_B Dex 0.15%,HD_dialysate_B dex 0.1%,HD_dialysate_B dex 0.1%.1,HD_dialysate_B dex 0.1%.2,HD_dialysate_B dex 0.15%,HD_dialysate_B dex 0.15%.1,HD_dialysate_B1/5L,HD_dialysate_B1/5L.1,HD_dialysate_Blue,HD_dialysate_Fx5,HD_dialysate_NC1485,HD_dialyzer_APS-15U,HD_dialyzer_APS-15u,HD_dialyzer_APS-21U,HD_dialyzer_Adsorba,HD_dialyzer_B dex 0.15%,HD_dialyzer_B1/5L,HD_dialyzer_BLD816SD,HD_dialyzer_BLD819SD,HD_dialyzer_BLS 812G,HD_dialyzer_BLS12G,HD_dialyzer_BLS14SD,HD_dialyzer_BLS812G,HD_dialyzer_BLS812SD,HD_dialyzer_BLS814SD,HD_dialyzer_BLS816SD,HD_dialyzer_BLS819,HD_dialyzer_BLS819SD,HD_dialyzer_F4 HPS,HD_dialyzer_F5 HPS,HD_dialyzer_F6 HPS,HD_dialyzer_FB130T,HD_dialyzer_FX,HD_dialyzer_FX paed,HD_dialyzer_FX40,HD_dialyzer_FX5,HD_dialyzer_FX50,HD_dialyzer_FX8,HD_dialyzer_FX80,HD_dialyzer_Fx5,HD_dialyzer_NC1485,HD_dialyzer_PHF0714,HD_dialyzer_Polyflux 14L,HD_dialyzer_Polyflux 6H,HD_dialyzer_Rexeed-13LX,HD_dialyzer_Rexeed-18LX,HD_dialyzer_SG30,HD_dialyzer_Sureflux 130E-GA,HD_dialyzer_Theranova 400,HD_dialyzer_fx5,HD_dialyzer_polyflux 14,HD_dialyzer_polyflux 14H,HD_dialyzer_polyflux 14L,HD_dialyzer_polyflux 14L.1,HD_dialyzer_polyflux 14S,HD_dialyzer_polyflux 170H,HD_dialyzer_polyflux 17L,HD_dialyzer_polyflux 17S,HD_dialyzer_polyflux 6H,HD_dialyzer_polyflux 8L,HD_dialyzer_polyflux s,HD_ntime_target,VS_sbp_init_in_pre_hd,VS_dbp_init_in_pre_hd,VS_sbp_min_in_pre_hd,VS_dbp_min_in_pre_hd,VS_sbp_max_in_pre_hd,VS_dbp_max_in_pre_hd,pre_hd,VS_sbp_target,VS_dbp_target,VS_sbp_target_class,VS_dbp_target_class
63,7713592201612190,-0.792345,-0.379986,1,0.550147,180,-1.287793,-1.428803,-0.851625,0.413566,1,0,-0.792345,-0.379986,1.044813,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.799268,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-0.866796,-0.732617,3,2
64,7713592201612190,-0.792345,-0.379986,1,0.550147,180,-0.799268,-1.276686,-0.851625,0.413566,1,0,-0.866796,-0.732617,1.276455,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.473584,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-0.941247,-0.521038,3,2
65,7713592201612190,-0.792345,-0.379986,1,0.550147,180,-0.473584,-1.023157,-0.851625,0.413566,1,0,-0.941247,-0.521038,1.218545,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.1479,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-1.42518,-1.155776,1,1
66,7713592201612190,-0.792345,-0.379986,1,0.550147,180,-0.1479,-0.668216,-0.851625,0.413566,1,0,-1.42518,-1.155776,1.392277,0.280949,-0.948728,0.514943,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.473584,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-1.536856,-0.521038,3,3
67,7713592201612190,-0.792345,-0.379986,1,0.550147,180,-0.473584,-0.414687,-0.851625,0.413566,1,0,-1.536856,-0.521038,1.623919,0.280949,-0.948728,0.21294,1,0,0,0.0,0.0,1.0,-0.181199,-0.475888,-0.931784,-0.762258,-0.928799,0.037681,-0.661833,-2.222634,-0.287181,0.295207,-1.705129,0.157974,-1.05028,-0.106702,0.655606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.473584,-0.420089,0.113699,-2.318594,-1.508407,-0.420089,0.113699,1,-0.82957,0.536857,4,3


In [166]:
df.shape

(160783, 148)

In [167]:
def convert_to_sequence(total_seq, df):
    grouped = df.sort_values(['ID_hd', 'HD_ctime'], ascending=[True, True]).groupby('ID_hd')
    unique = df['ID_hd'].unique()
    for id_ in unique:
        seq = grouped.get_group(id_)  # dataframe type
        seq.drop('ID_hd', axis=1, inplace=True)
        total_seq.append(seq.values.tolist())

    total_seq = np.array([np.array(i) for i in total_seq])
    return np.asarray(total_seq)

In [168]:
total_seq = []
refined_df = convert_to_sequence(total_seq, df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [171]:
len(refined_df), len(refined_df[0]), len(refined_df[0][0])

(26179, 10, 147)

In [174]:
refined_df

array([array([[-0.79234465, -0.37998553,  1.        , ..., -0.73261736,
         3.        ,  2.        ],
       [-0.79234465, -0.37998553,  1.        , ..., -0.52103826,
         3.        ,  2.        ],
       [-0.79234465, -0.37998553,  1.        , ..., -1.15577555,
         1.        ,  1.        ],
       ...,
       [-0.79234465, -0.37998553,  1.        , ..., -1.08524919,
         0.        ,  1.        ],
       [-0.79234465, -0.37998553,  1.        , ..., -1.36735465,
         4.        ,  2.        ],
       [-0.79234465, -0.37998553,  1.        , ..., -1.29682828,
         1.        ,  2.        ]]),
       array([[ 1.66454416,  0.53685722,  0.        , ..., -0.4505119 ,
         0.        ,  0.        ],
       [ 1.66454416,  0.53685722,  0.        , ..., -0.4505119 ,
         3.        ,  2.        ],
       [ 1.66454416,  0.53685722,  0.        , ..., -0.09788007,
         4.        ,  2.        ],
       ...,
       [ 1.66454416,  0.53685722,  0.        , ...,  0.18422

In [None]:
class HemodialysisDataset():
    """Hemodialysis dataset from SNU"""

    def __init__(self, root_dir, csv_files, model_type, save=True):
        self.hemodialysis_frame = pd.DataFrame()
        self.files = csv_files
        self.root_dir = root_dir
        # self.training_type = training_type
        self.model_type = model_type
        self.total_seq = []
        self.columns = []
        self.mean_for_normalize = {}
        self.std_for_normalize = {}
        # self.init_value = pd.DataFrame()
        self.sbp_diff = -20
        self.dbp_diff = -10
        self.seed= 212
        self._init_dataset(save)

    def __len__(self):
        return len(self.hemodialysis_frame)

    def __getitem__(self, idx):
        return self.hemodialysis_frame[idx]

    def _init_dataset(self, save):
        for f in self.files:
            tmp = pd.read_csv(os.path.join(self.root_dir, f), header=0)
            self.hemodialysis_frame = pd.concat([self.hemodialysis_frame, tmp])
        self.refine_dataset()
        self.normalize()
        # self.hemodialysis_frame = self.hemodialysis_frame.loc[self.hemodialysis_frame.ID_class == self.training_type]
        if self.model_type == 'MLP':
            self.concat_history()
        else:
            self.make_sequence()
            self.add_pre_hemodialysis()
            self.order_target_column()
        self.split_train_test()
        self.columns = list(filter(lambda x: x not in ['ID_class', 'ID_hd'], self.hemodialysis_frame.columns))
        if save:
            print("Saving...")
            for type in ['Train', 'Validation', 'Test']:
                df = self.hemodialysis_frame.loc[self.hemodialysis_frame.ID_class == type].copy()
                df = df.drop('ID_class', axis=1)
                if self.model_type == 'MLP':
                    df = df.drop('ID_hd', axis=1)
                    df = np.array(df).astype('float')
                else:
                    df = self.convert_to_sequence(df)
                torch.save(df, '{}_{}.pt'.format(self.model_type, type))
        if not save:
            self.hemodialysis_frame.drop('ID_class', axis=1, inplace=True)
            if self.model_type == 'MLP':
                np.array(self.hemodialysis_frame).astype('float')
            else:
                self.hemodialysis_frame = self.convert_to_sequence(self.hemodialysis_frame)


    def split_train_test(self):
        print("Splitting...")
        np.random.seed(self.seed)
        key = self.hemodialysis_frame['ID_hd'].unique()
        idx = np.random.permutation(range(len(key)))
        train_split, val_split = int(np.floor(0.7 * len(key))), int(np.floor(0.8 * len(key)))
        train_idx, val_idx, test_idx = idx[:train_split], idx[train_split:val_split], idx[val_split:]
        training_type = pd.DataFrame(data=key, columns=['ID_hd'])
        training_type['ID_class'] = 0; training_type.loc[train_idx, 'ID_class'] = 'Train'; training_type.loc[val_idx, 'ID_class'] = 'Validation'; training_type.loc[test_idx, 'ID_class'] = 'Test'
        self.hemodialysis_frame = training_type.merge(self.hemodialysis_frame, how='inner', left_on=["ID_hd"], right_on=["ID_hd"])

    def refine_dataset(self):
        self.hemodialysis_frame['Pt_id'] = [x.replace('*', '') for x in self.hemodialysis_frame['Pt_id']]
        self.hemodialysis_frame['Pt_id'] = self.hemodialysis_frame['Pt_id'].astype(str).astype(int)
        self.hemodialysis_frame['ID_hd'] = [x.replace('*', '') for x in self.hemodialysis_frame['ID_hd']]
        self.hemodialysis_frame['ID_hd'] = self.hemodialysis_frame['ID_hd'].astype(str).astype(int)
        categorical = ['HD_type','HD_acces','HD_prim','HD_dialysate','HD_dialyzer']
        drop_columns = ['VS_rr', 'Time']
        self.hemodialysis_frame.drop(labels=drop_columns, inplace=True, axis=1)
        timestamp = pd.to_datetime(self.hemodialysis_frame['HD_duration']).dt
        self.hemodialysis_frame['HD_duration'] = timestamp.hour * 60 + timestamp.minute
        self.hemodialysis_frame.fillna(method='ffill', inplace=True)
        self.hemodialysis_frame['Pt_sex'] = self.hemodialysis_frame['Pt_sex'].replace({'M':0, 'F':1})
        self.hemodialysis_frame = pd.get_dummies(self.hemodialysis_frame, columns=categorical, prefix=categorical)
        self.hemodialysis_frame = self.hemodialysis_frame.loc[(self.hemodialysis_frame.VS_sbp > 0) & (self.hemodialysis_frame.VS_dbp > 0)]

    def concat_history(self):
        print("Concating...")
        self.hemodialysis_frame.drop(labels=['Pt_id'], axis=1, inplace=True)
        self.hemodialysis_frame['time'] = [dt.datetime.strptime(x[10:], '%Y%m%d*%H%M') for x in self.hemodialysis_frame['ID_timeline']]
        self.hemodialysis_frame['prev_time'] = self.hemodialysis_frame.apply(lambda x: x.time - dt.timedelta(minutes=x.HD_ntime_raw), axis=1)
        prev_data = self.hemodialysis_frame.copy()
        target_data = self.hemodialysis_frame.loc[self.hemodialysis_frame.HD_ctime_raw > 0][['ID_hd','prev_time', 'HD_ntime', 'VS_sbp', 'VS_dbp']]
        init_data = self.hemodialysis_frame.loc[self.hemodialysis_frame.HD_ctime_raw == 0][['ID_hd', 'VS_sbp', 'VS_dbp']]
        self.hemodialysis_frame = self.hemodialysis_frame.merge(prev_data, how='inner', left_on=['ID_hd', 'prev_time'], right_on=['ID_hd', 'time'], suffixes=('', '_prev'))
        self.hemodialysis_frame = self.hemodialysis_frame.merge(target_data, how='inner', left_on=['ID_hd', 'time'], right_on=['ID_hd', 'prev_time'], suffixes=('', '_target'))
        self.hemodialysis_frame = init_data.merge(self.hemodialysis_frame, how='inner', on=['ID_hd'], suffixes=('_init', ''))
        drop_columns_for_learning = ['ID_timeline', 'HD_ctime_raw', 'HD_ntime_raw', 'time', 'prev_time', 'Pt_sex_prev', 'Pt_age_prev', 'ID_timeline_prev', 'time_prev', 'prev_time_prev', 'prev_time_target']
        self.hemodialysis_frame.drop(labels=drop_columns_for_learning, axis=1, inplace=True)
        self.add_target_class()
        # self.init_value = self.hemodialysis_frame[['VS_sbp_init', 'VS_dbp_init']]

    def make_sequence(self):
        self.hemodialysis_frame['time'] = [dt.datetime.strptime(x[10:], '%Y%m%d*%H%M') for x in self.hemodialysis_frame['ID_timeline']]
        self.hemodialysis_frame['prev_time'] = self.hemodialysis_frame.apply(lambda x: x.time - dt.timedelta(minutes=x.HD_ntime_raw), axis=1)
        target_data = self.hemodialysis_frame.loc[self.hemodialysis_frame.HD_ctime_raw > 0][['ID_hd', 'prev_time', 'HD_ntime', 'VS_sbp', 'VS_dbp']]
        init_data = self.hemodialysis_frame.loc[self.hemodialysis_frame.HD_ctime_raw == 0][['ID_hd', 'VS_sbp', 'VS_dbp']]
        self.hemodialysis_frame = self.hemodialysis_frame.merge(target_data, how='inner', left_on=['ID_hd', 'time'], right_on=['ID_hd', 'prev_time'], suffixes=('', '_target'))
        self.hemodialysis_frame = init_data.merge(self.hemodialysis_frame, how='inner', on=['ID_hd'], suffixes=('_init', ''))

        drop_columns = ['ID_timeline', 'HD_ntime_raw', 'time', 'prev_time', 'prev_time_target']
        self.hemodialysis_frame.drop(labels=drop_columns, axis=1, inplace=True)
        self.add_target_class()

    def normalize(self):
        print('Normalizing...')
        self.hemodialysis_frame['HD_ctime_raw'] = self.hemodialysis_frame['HD_ctime']
        self.hemodialysis_frame['HD_ntime_raw'] = self.hemodialysis_frame['HD_ntime']
        numerical_col = ['Pt_age', 'HD_ntime', 'HD_ctime', 'HD_prewt', 'HD_uf', 'VS_sbp', 'VS_dbp', 'VS_hr', 'VS_bt', 'VS_bfr', 'VS_uft', 'Lab_wbc', 'Lab_hb', 'Lab_plt', 'Lab_chol', 'Lab_alb', 'Lab_glu', 'Lab_ca', 'Lab_phos', 'Lab_ua', 'Lab_bun', 'Lab_scr', 'Lab_na', 'Lab_k', 'Lab_cl', 'Lab_co2']
        for col in numerical_col:
            self.mean_for_normalize[col] = self.hemodialysis_frame[col].mean()
            self.std_for_normalize[col] = self.hemodialysis_frame[col].std()
            if self.std_for_normalize[col] > 0:
                self.hemodialysis_frame[col] = (self.hemodialysis_frame[col] - self.mean_for_normalize[col]) / self.std_for_normalize[col]
            else:
                self.hemodialysis_frame[col] = 0

    def add_target_class(self):
        def eval_target(diff, type):
            if type == 'sbp':
                if diff < -20 :
                    return 0
                if diff < -10 :
                    return 1
                if diff < -5 :
                    return 2
                if diff < 5 :
                    return 3
                else:
                    return 4
            if type == 'dbp':
                if diff < -10:
                    return 0
                if diff < -5:
                    return 1
                if diff < 5:
                    return 2
                else:
                    return 3

        self.hemodialysis_frame['VS_sbp_target_class'] = ((self.hemodialysis_frame['VS_sbp_target'] - self.hemodialysis_frame['VS_sbp']) * self.std_for_normalize['VS_sbp']).apply(lambda x: eval_target(x,'sbp'))
        self.hemodialysis_frame['VS_dbp_target_class'] = ((self.hemodialysis_frame['VS_dbp_target'] - self.hemodialysis_frame['VS_dbp']) * self.std_for_normalize['VS_dbp']).apply(lambda x: eval_target(x,'dbp'))

    def add_pre_hemodialysis(self):
        def merge_pre(frame):
            min_bp = frame.groupby(['Pt_id', 'ID_hd'])['VS_sbp', 'VS_dbp'].min().reset_index()
            max_bp = frame.groupby(['Pt_id', 'ID_hd'])['VS_sbp', 'VS_dbp'].max().reset_index()
            init_bp = frame.loc[frame.HD_ctime_raw == 0][['Pt_id', 'ID_hd', 'VS_sbp', 'VS_dbp']]
            init_bp.columns = ['Pt_id', 'ID_hd', 'VS_sbp_init_in_pre_hd', 'VS_dbp_init_in_pre_hd']
            pre_hd = min_bp.merge(max_bp, how='inner', on=['Pt_id', 'ID_hd'], suffixes=('_min_in_pre_hd', '_max_in_pre_hd'))
            pre_hd = init_bp.merge(pre_hd, how='inner', on=['Pt_id', 'ID_hd'])
            pre_hd['rank'] = pre_hd.sort_values(['Pt_id', 'ID_hd'], ascending=[True, True]).groupby(['Pt_id']).cumcount() + 1
            del min_bp, max_bp, init_bp
            frame = frame.merge(pre_hd[['Pt_id', 'ID_hd', 'rank']], on=['Pt_id', 'ID_hd'], how='inner')
            frame['rank'] -= 1
            frame = frame.merge(pre_hd, on=['Pt_id', 'rank'], how='left', suffixes=('', '_pre'))
            frame['pre_hd'] = [0 if np.isnan(x) else 1 for x in frame['VS_sbp_min_in_pre_hd']]
            frame.drop(labels=['Pt_id', 'rank', 'ID_hd_pre', 'HD_ctime_raw'], axis=1, inplace=True)
            frame.fillna(0.0, inplace=True)
            return frame
        self.hemodialysis_frame = merge_pre(self.hemodialysis_frame)

    def order_target_column(self):
        target_columns = ['VS_sbp_target', 'VS_dbp_target', 'VS_sbp_target_class', 'VS_dbp_target_class']
        input_columns = self.hemodialysis_frame.columns[
            [False if x in target_columns else True for x in self.hemodialysis_frame.columns]].to_list()
        self.hemodialysis_frame = self.hemodialysis_frame[input_columns + target_columns]

    def convert_to_sequence(self, df):
        grouped = df.sort_values(['ID_hd', 'HD_ctime'], ascending=[True,True]).groupby('ID_hd')
        unique = df['ID_hd'].unique()
        for id_ in unique:
            seq = grouped.get_group(id_)  # dataframe type
            seq.drop('ID_hd', axis=1, inplace=True)
            self.total_seq.append(seq.values.tolist())

        self.total_seq = np.array([np.array(i) for i in self.total_seq])
        return np.asarray(self.total_seq)
