In [347]:
import pandas as pd
# Disable pandas' SettingWithCopyWarning for the whole notebook. Later cells
# assign new columns onto frames obtained by slicing other frames; with this
# option set to None pandas will not warn about any such assignment.
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt

In [348]:
# load in data
# All inputs are read from one directory, relative to the notebook's working directory.
data_path='../data_extraction/data/'

cohort = pd.read_table(data_path+'cohort.tsv')      # tab-separated cohort table
maps = pd.read_csv(data_path+'map.csv')             # raw MAP readings
creatinine = pd.read_csv(data_path+'creatinine.csv')
admission_creatinine = pd.read_csv(data_path+'admission_creatinine.csv')
# FIX: this previously re-read 'map.csv', so `urine` silently held a second copy
# of the MAP readings. The filename below follows the naming pattern of the other
# extracts -- TODO confirm it matches the file produced by the extraction step.
urine = pd.read_csv(data_path+'urine.csv')

In [349]:
# Descriptive statistics (count / mean / std / quartiles / extremes) for the
# numeric cohort variables.
numeric_cols = ['age', 'los', 'max_lactate', 'vaso_frac']
numeric_summary = cohort[numeric_cols].describe()
# NOTE(review): the printed summary shows max_lactate peaking at 999999 and age
# peaking near 310, which look like sentinel / obfuscated values rather than
# real measurements -- confirm and clean before these columns are modelled.
print(numeric_summary)

               age          los    max_lactate    vaso_frac
count  5342.000000  5342.000000    4485.000000  5342.000000
mean     71.703842    11.416165     226.160700     0.133613
std      48.474031    10.331876   14931.965231     0.214067
min      18.138720     3.000300       0.500000     0.000000
25%      52.494296     4.790000       1.500000     0.000000
50%      65.322301     7.903400       2.200000     0.006947
75%      76.984900    14.118000       3.800000     0.196396
max     309.777981   173.072500  999999.000000     0.995077


In [350]:
# Share of the cohort in each gender category (fractions, not raw counts).
gender_fracs = pd.value_counts(cohort['gender'].values, normalize=True)
print(gender_fracs)

M    0.510857
F    0.489143
dtype: float64


In [351]:
# Collapse the raw ethnicity labels into a small set of reporting groups and
# print the share of the cohort in each group.
eth = cohort['ethnicity'].astype('category')

# grouping based on the standards at: http://grants.nih.gov/grants/guide/notice-files/NOT-OD-15-089.html
# These three target labels are registered up front so they can be assigned
# into the categorical; the remaining targets ('ASIAN', 'BLACK/AFRICAN AMERICAN',
# 'WHITE') are assigned without being added, so they must already occur in the data.
eth = eth.cat.add_categories(['HISPANIC/LATINO','MULTI/OTHER','UNKNOWN'])

# (substrings to look for, label to assign). The rules are applied IN ORDER and
# each rule sees the labels produced by the previous ones, so a raw label that
# matches several rules ends up with the label of the LAST rule that still
# matches its current value. Do not reorder without re-checking the output.
regrouping_rules = [
    (('HISPANIC', 'PORTUGUESE'), 'HISPANIC/LATINO'),
    (('ASIAN',), 'ASIAN'),
    (('BLACK', 'AFRICAN'), 'BLACK/AFRICAN AMERICAN'),
    (('WHITE', 'MIDDLE EAST'), 'WHITE'),
    (('MULTI', 'OTHER'), 'MULTI/OTHER'),
    (('DECLINE', 'UNABLE', 'UNKNOWN'), 'UNKNOWN'),
]

for keywords, group_label in regrouping_rules:
    # Boolean mask: True where the current label contains any of the keywords.
    matches = np.array([any(word in current for word in keywords)
                        for current in eth], dtype=bool)
    eth[matches] = group_label

# Drop the raw categories that no longer have any members.
eth = eth.cat.remove_unused_categories()

# sort=False keeps category order instead of ordering by frequency.
print(pd.value_counts(eth.values, normalize=True, sort=False))

AMERICAN INDIAN/ALASKA NATIVE    0.000374
ASIAN                            0.019656
BLACK/AFRICAN AMERICAN           0.064770
WHITE                            0.727256
HISPANIC/LATINO                  0.031636
MULTI/OTHER                      0.027331
UNKNOWN                          0.128978
dtype: float64


In [352]:
# convert raw MAP readings to MAP features
#
# Produces, per ICU stay:
#   interp_mean_maps : hourly mean MAP with gap hours filled by linear interpolation
#   min_maps         : the row (hour, value, bin) of the lowest hourly MAP
#   map_fracs        : fraction of the first 72 hours spent in each MAP bin
interval = 60  # bucket width, in the units of `min_from_intime` (minutes, going by the column name)

# average MAP for every hour
# FIX: a bare `maps.set_index('icustay_id')` used to sit here; its result was
# discarded, so it did nothing and has been removed.
# FIX: bucket with floor rather than int truncation. Truncation rounds toward
# zero, which merged readings from the hour before time 0 into hour 0.
maps['hour'] = np.floor(maps['min_from_intime'] / float(interval)).astype(int)

# get means for every hour: one row per (icustay_id, hour) that has a reading
mean_maps = maps.groupby(['icustay_id', 'hour'])['value'].mean().to_frame()

# interpolate MAPs for missing values
# First and last observed hour of each stay define the range to fill.
hours_by_stay = mean_maps.reset_index().groupby('icustay_id')['hour']
min_hours = hours_by_stay.min()
max_hours = hours_by_stay.max()

# Build the complete (icustay_id, hour) grid, including hours with no reading.
interp_index = []
for this_icustay, min_hour, max_hour in zip(min_hours.index, min_hours.values, max_hours.values):
    interp_index.extend((this_icustay, hour) for hour in np.arange(min_hour, max_hour + 1))

full_index = pd.MultiIndex.from_tuples(interp_index, names=['icustay_id', 'hour'])
interp_mean_maps = mean_maps.reindex(full_index)
# The interpolation runs over the whole column at once. Each stay's first and
# last grid rows are observed hours, so (provided those hourly means are not
# null) every gap is bracketed by values from the same stay.
interp_mean_maps = interp_mean_maps['value'].interpolate(method='linear')
interp_mean_maps = interp_mean_maps.to_frame().reset_index()

# get percent of hours missing a MAP value
# Rows added by the reindex are exactly the hours that had no reading.
missing_map = len(interp_mean_maps.index) - len(mean_maps.index)

frac_missing = missing_map/float(len(interp_mean_maps.index))
print("Fraction of hours missing MAP values: %s" % frac_missing)

# get minimum MAP value per patient
min_ind = interp_mean_maps.groupby('icustay_id')['value'].idxmin(skipna=True)
# .copy() so that adding the 'bin' column below writes to an independent frame.
min_maps = interp_mean_maps.loc[min_ind].copy()

# bin MAP values for first 72 hours
# FIX: the lower bound was `0 < hour`, which dropped hour 0 and left only hours
# 1..71. Hours 0..71 are the first 72 hourly buckets.
in_first_72 = (interp_mean_maps['hour'] >= 0) & (interp_mean_maps['hour'] < 72)
map_72 = interp_mean_maps.loc[in_first_72].copy()

# Bin edges 30, 40, ..., 90, 200. pd.cut leaves values outside (30, 200] as NaN.
map_cutoffs = np.append(np.arange(30,100,10),200)
map_72['bin'] = pd.cut(map_72['value'], map_cutoffs)
min_maps['bin'] = pd.cut(min_maps['value'], map_cutoffs)

# Per stay: share of its hours in the window that fall in each bin.
map_fracs = map_72.groupby('icustay_id')['bin'].value_counts(normalize=True)

# reformat features to be used in final dataset
min_maps = min_maps.set_index('icustay_id')
map_fracs = map_fracs.to_frame()

Fraction of hours missing MAP values: 0.0729135812521


In [353]:
# use creatinine measurements to determine AKI onset
#
# Produces `creat_summary`, indexed by icustay_id, holding the reference
# creatinine, the maximum creatinine, the first measurement meeting each
# threshold (i_* / f_*), and the boolean `creat_aki`.
#
# NOTE(review): this cell overwrites `creatinine` (filter + merge), so running
# it twice in one kernel session merges the reference columns a second time.
# Re-run the data-loading cell first if this cell needs to be repeated.
creatinine = creatinine.dropna()

# only consider creatinine measurements after admission
creatinine = creatinine.loc[creatinine['min_from_intime']>0]

# attach each stay's reference (admission) measurement; its columns get the
# '_ref' suffix. The merge also gives the frame a fresh 0..n-1 row index, which
# the idxmin/idxmax lookups below rely on being unique.
creatinine = creatinine.merge(admission_creatinine,suffixes=('','_ref'),on=['icustay_id'],how='left')

# RIFLE Creatinine Criteria, relative to the reference value:
#   I: measurement is at least 2x the reference (creatinine doubles)
#   F: measurement is at least 3x the reference
# A missing reference makes both comparisons False.
creatinine['I'] = creatinine['value']>=2*creatinine['value_ref']
creatinine['F'] = creatinine['value']>=3*creatinine['value_ref']

# Group creatinine measurements by icustay_id
icustay_creat = creatinine.groupby(['icustay_id'])
all_stays = icustay_creat.size().index


def _first_onset_index(frame, flag, stays):
    """Row label of the earliest measurement per stay where `flag` is True.

    Returns a Series indexed by `stays`; stays that never meet the criterion
    hold NaN.
    """
    met = frame.loc[frame[flag]]
    # FIX: pick the earliest measurement by time. The previous code took the
    # first flagged row in file order, which is only the earliest one if the
    # input happens to be sorted by time within each stay.
    onset = met.groupby('icustay_id')['min_from_intime'].idxmin()
    return onset.reindex(stays)


def _onset_table(frame, onset_index, prefix):
    """Time and value of the onset measurements, indexed by icustay_id."""
    # FIX: label-based .loc with integer labels replaces the deprecated .ix,
    # which was being fed float labels.
    labels = onset_index.dropna().astype(int).tolist()
    table = frame.loc[labels, ['icustay_id', 'min_from_intime', 'value']]
    table = table.set_index('icustay_id')
    return table.rename(columns={'value': prefix + '_val',
                                 'min_from_intime': prefix + '_time'})


# Find the first time the patient meets the RIFLE creatinine criteria
i_creat_ind = _first_onset_index(creatinine, 'I', all_stays)
f_creat_ind = _first_onset_index(creatinine, 'F', all_stays)

i_creat = _onset_table(creatinine, i_creat_ind, 'i')
f_creat = _onset_table(creatinine, f_creat_ind, 'f')

# also get max creatinine value within time window
max_creat = creatinine.loc[icustay_creat['value'].idxmax(skipna=True)]

# nicely summarize creatinine data
d = {'icustay_id':max_creat['icustay_id'],
     'ref_value':max_creat['value_ref'],
     'ref_time':max_creat['min_from_intime_ref'],
     'max_value':max_creat['value'],
     'max_time':max_creat['min_from_intime']}
creat_summary = pd.DataFrame(d)
creat_summary = creat_summary.set_index('icustay_id')

# Left joins: stays that never meet a criterion keep NaN in its i_* / f_* columns.
creat_summary = creat_summary.join(i_creat,how='left')
creat_summary = creat_summary.join(f_creat,how='left')
# AKI label: True when the stay has a measurement meeting the I (2x) criterion.
creat_summary['creat_aki']=pd.notnull(creat_summary['i_val'])

In [354]:
# Assemble the modelling table: one row per icustay_id with the cohort
# covariates, the regrouped ethnicity, the minimum-MAP features and the
# creatinine-based AKI label.
cohort_columns = ['icustay_id', 'age', 'los', 'max_lactate', 'vaso_frac', 'gender']

lr_data = (
    cohort[cohort_columns]
    # `eth` is aligned to the cohort rows by index.
    .assign(eth=eth)
    .set_index('icustay_id')
    # Left join keeps every cohort stay. All columns of min_maps come along,
    # and only 'value' and 'bin' are renamed afterwards.
    .join(min_maps, how='left')
    .rename(columns={'value': 'min_map', 'bin': 'min_map_bin'})
    # Stays absent from creat_summary get NaN for creat_aki.
    .join(creat_summary['creat_aki'], how='left')
)