In [1]:
from __future__ import print_function
from matplotlib import pyplot
pyplot.style.use('dark_background')
from IPython.display import display, HTML
# Notebook-wide CSS overrides, injected one <style> tag at a time:
# a full-width container plus compact rendering for DataFrame tables.
for _style_tag in (
    "<style>.container { width:100% !important; }</style>",
    "<style>table.dataframe {font-size:60%;line-height:100%; padding:0px; margin:0px;}</style>",
    "<style>table.dataframe td {padding:3px; margin:0px;}</style>",
    "<style>table.dataframe tr {padding:3px; margin:0px;}</style>",
    "<style>table.dataframe th {padding:3px; margin:0px;}</style>",
):
    display(HTML(_style_tag))
import yaml
import pandas as pd
#
from mimic3benchmark.mimic3csv import *
from mimic3benchmark.preprocessing import add_hcup_ccs_2015_groups, make_phenotype_label_matrix
from mimic3benchmark.util import *
from OF_util import write_csv_file

In [None]:
class Args(object):
    """Plain attribute holder for the notebook's path settings."""
    # Directory containing the MIMIC-III CSV files; overridden on the instance below.
    mimic3_path = ""


args = Args()
args.mimic3_path = "../../../mimic-iii-clinical-database-1.4"

# Read the patients table from that directory and preview the first rows.
patients = read_patients_table_with_expire_flag(args.mimic3_path)
display(patients.head())

In [None]:
# Read the admissions table from the configured MIMIC-III directory and show it.
admits = read_admissions_table(args.mimic3_path)
display(admits)

In [None]:
# Read the ICU stays table; `stays` is the frame every later cell builds on.
stays = read_icustays_table(args.mimic3_path)
display(stays)

In [None]:
# Combine the three tables: stays are merged with admissions first (inner
# call), and that result is then merged with patients (outer call).
# NOTE: `stays` is rebound here, so re-running this cell without re-reading
# the ICU stays table merges the same tables a second time.
stays = merge_on_subject(merge_on_subject_admission(stays, admits), patients)
display(stays)

In [None]:
# Apply the three helpers innermost-first: age, then in-unit mortality, then
# in-hospital mortality. The displayed frame includes age and death in hospital.
stays = add_inhospital_mortality_to_icustays(
    add_inunit_mortality_to_icustays(
        add_age_to_icustays(stays)))
display(stays)

In [None]:
# Age filter. The cut-off is whatever filter_icustays_on_age uses by default;
# the original author's note says it keeps patients above 18 years old
# (TODO confirm against the helper).
stays = filter_icustays_on_age(stays)
display(stays)

In [None]:
# Read the ICD diagnoses and keep only the rows with SEQ_NUM == 1
# (presumably the primary diagnosis of each admission -- confirm).
diagnoses = read_icd_diagnoses_table(args.mimic3_path)
diagnoses = diagnoses[diagnoses['SEQ_NUM'] == 1]

# Inner merge on subject + admission: stays without a matching diagnosis row
# are dropped, and ICD9_CODE is added to the remaining stays.
stays = stays.merge(
    diagnoses[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']],
    how='inner',
    on=['SUBJECT_ID', 'HADM_ID'])
print(stays.columns)

In [None]:
# Order rows by subject, then by hospital admission time, then by ICU in-time.
stays = stays.sort_values(by=['SUBJECT_ID', 'ADMITTIME', 'INTIME'])

# Zero-based position of each row within its subject, in the order above:
# ranking the (constant-per-group) SUBJECT_ID with method='first' numbers the
# rows 1, 2, 3, ... and the subtraction makes the first row 0.
# NOTE(review): this counts earlier *rows* for the subject, so several rows
# sharing one HADM_ID each add to the count -- confirm that matches the
# "previous hospital admissions" meaning suggested by the column name.
stays['Num_Prev_Hos_Adm'] = (
    stays.groupby('SUBJECT_ID')['SUBJECT_ID'].rank(method='first') - 1
).astype(int)

In [None]:
# Hospital length of stay, expressed in (fractional) days.
stays['Hos_LOS'] = (stays['DISCHTIME'] - stays['ADMITTIME']) / pd.to_timedelta(1, unit='D')

# Remove stays flagged as in-hospital deaths. The explicit copy matters: the
# label columns below are assigned onto this frame, and assigning onto a
# filtered slice raises pandas' SettingWithCopyWarning (and may not stick).
stays = stays[stays['MORTALITY_INHOSPITAL'] != 1].copy()
# stays3 = stays[0:1000]
stays3 = stays  # same object as `stays`, so `stays` gains the new columns too

_window_30d = pd.Timedelta(days=30)

# Gap between this row's admission and the previous row's discharge for the
# same subject. Rows must already be in chronological order within each
# SUBJECT_ID. GroupBy.shift keeps the original index, so the result aligns
# directly (no apply/reset_index) and also works when only one subject is present.
stays3['days_diff'] = stays3['ADMITTIME'] - stays3.groupby('SUBJECT_ID')['DISCHTIME'].shift()
# NOTE(review): the gap looks backwards in time, so the two flags below mark
# the *later* stay (the return visit), not the stay that preceded it --
# confirm which stay is meant to carry the label.
stays3['days_diff_positive'] = (stays3['days_diff'] > pd.Timedelta(days=0)) & (stays3['days_diff'] <= _window_30d)
stays3['days_diff_negative'] = stays3['days_diff'] > _window_30d

# Time from hospital discharge to the recorded date of death.
_discharge_to_death = stays3['DOD'] - stays3['DISCHTIME']
stays3['dod_out_hos_in30days'] = _discharge_to_death <= _window_30d
stays3['dod_out_hos_after30days'] = _discharge_to_death > _window_30d
stays3['expire_flag_is_false'] = stays3['EXPIRE_FLAG'] == False

# Final labels.
# NOTE(review): `expire_flag_is_false` feeds `negative` unconditionally, so a
# row can be both positive and negative (e.g. a return within 30 days for a
# subject whose EXPIRE_FLAG is false). Rows where positive == negative are
# dropped in a later cell -- confirm that this is intended.
stays3['positive'] = stays3['dod_out_hos_in30days'] | stays3['days_diff_positive']
stays3['negative'] = stays3['dod_out_hos_after30days'] | stays3['days_diff_negative'] | stays3['expire_flag_is_false']
# stays3['days_diff'] = stays4['days_diff']

# Narrow view of the identifiers, timestamps and label columns for inspection.
stays4 = stays3[['SUBJECT_ID','HADM_ID','ICUSTAY_ID','ADMITTIME','DISCHTIME','days_diff','days_diff_positive',
                 'days_diff_negative','DOD', 'dod_out_hos_in30days','dod_out_hos_after30days','positive','negative', 'expire_flag_is_false']]
display(stays4)

In [None]:
# Columns carried forward for the label analysis. Each name appears exactly
# once: listing ADMITTIME / DISCHTIME / DOD twice would create duplicate
# column labels, after which e.g. stays_all['DOD'] returns a DataFrame
# instead of a Series.
stays_all = stays3[[
    # identifiers
    'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',
    # demographics / admission attributes
    'INSURANCE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
    'GENDER', 'DOB', 'DOD', 'AGE', 'MORTALITY_INHOSPITAL', 'DEATHTIME', 'DBSOURCE',
    # ICU and hospital timing
    'INTIME', 'OUTTIME', 'LOS', 'ADMITTIME', 'DISCHTIME', 'Hos_LOS',
    # diagnosis and history
    'ICD9_CODE', 'Num_Prev_Hos_Adm',
    # label construction columns
    'days_diff', 'days_diff_positive', 'days_diff_negative',
    'dod_out_hos_in30days', 'dod_out_hos_after30days',
    'positive', 'negative', 'expire_flag_is_false',
]]

In [None]:
# Rows whose two labels agree (both True or both False) carry no usable
# label and are removed. A boolean mask is used instead of dropping by index
# label, so the result does not depend on the index being unique and no
# intermediate filtered frame is built just to read its index.
_labels_agree = stays_all['positive'] == stays_all['negative']
stays_all_drop = stays_all.loc[
    ~_labels_agree,
    ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LOS', 'Hos_LOS', 'Num_Prev_Hos_Adm', 'positive', 'negative'],
]
display(stays_all_drop.head())
print(stays_all_drop.shape)

In [None]:
# Persist the labelled stays via the project's write_csv_file helper.
# NOTE(review): at this point the frame has not been down-sampled, yet the
# file name says "sampled" and differs by one letter from the
# "stays_all_drop_sampled.csv" written further down -- confirm the intended name.
write_csv_file(stays_all_drop,"stay_all_drop_sampled.csv")

In [None]:
# Split by label. Both label columns are boolean, so they can be used
# directly as row masks.
stays_all_drop_pos = stays_all_drop[stays_all_drop['positive']]
stays_all_drop_neg = stays_all_drop[stays_all_drop['negative']]

In [None]:
# Balance the classes: draw as many negative rows as there are positive rows.
# A fixed seed makes the draw (and therefore the CSV written afterwards)
# reproducible across kernel restarts.
SAMPLE_SEED = 42

num_rows_to_keep = len(stays_all_drop_pos)
print(num_rows_to_keep)
stays_all_drop_neg_sampled = stays_all_drop_neg.sample(n=num_rows_to_keep, random_state=SAMPLE_SEED)

In [None]:
# Stack all positive rows on top of the sampled negative rows and save the
# result. NOTE: this rebinds `stays_all_drop` to the balanced subset, so
# earlier cells that read `stays_all_drop` see the subset if re-run afterwards.
stays_all_drop = pd.concat([
    stays_all_drop_pos,
    stays_all_drop_neg_sampled,
])
write_csv_file(stays_all_drop, root="./", csv_file="stays_all_drop_sampled.csv")

In [None]:
# Sanity counts on the label columns of `stays_all`, printed in this order:
#   1. rows where positive == negative
#   2. ... of which both are True
#   3. ... of which both are False
#   4. rows with positive == True
#   5. rows with positive == False
_labels_equal = stays_all['positive'] == stays_all['negative']
_is_positive = stays_all['positive'] == True
_is_not_positive = stays_all['positive'] == False
for _row_mask in (
    _labels_equal,
    _labels_equal & _is_positive,
    _labels_equal & _is_not_positive,
    _is_positive,
    _is_not_positive,
):
    print(len(stays_all[_row_mask]))
# stays_all.to_csv('stay_all_results_olina.csv')

In [None]:
# print (stays3[stays3['days_diff']< pd.Timedelta(days=0)].count())

In [None]:
# print (stays3['positive'].value_counts())
# print (stays3['negative'].value_counts())
# print (stays3[(stays3['positive']==True) | (stays3['negative']==True)].count())