In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the cohort table (a tab-separated file) from the data-extraction output
# directory. `data_path` is reused by later cells to locate their input files.
data_path = '../data_extraction/data/'
cohort = pd.read_csv(data_path + 'cohort.tsv', sep='\t')

In [4]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the
# numeric cohort columns.
# NOTE(review): the printed maxima (age ~309.8, max_lactate 999999) look like
# placeholder codes rather than real measurements -- confirm how they should
# be handled before these columns are used for modelling.
numeric_cols = ['age', 'los', 'max_lactate', 'vaso_frac']
numeric_summary = cohort[numeric_cols].describe()
print(numeric_summary)

               age          los    max_lactate    vaso_frac
count  5342.000000  5342.000000    4485.000000  5342.000000
mean     71.703842    11.416165     226.160700     0.133613
std      48.474031    10.331876   14931.965231     0.214067
min      18.138720     3.000300       0.500000     0.000000
25%      52.494296     4.790000       1.500000     0.000000
50%      65.322301     7.903400       2.200000     0.006947
75%      76.984900    14.118000       3.800000     0.196396
max     309.777981   173.072500  999999.000000     0.995077


In [5]:
# Fraction of cohort rows recorded under each gender value.
gender_values = cohort['gender'].values
gender_fracs = pd.value_counts(gender_values, normalize=True)
print(gender_fracs)

M    0.510857
F    0.489143
dtype: float64


In [6]:
# ethnicity summary
# Collapse the raw ethnicity labels into broader groups, based on the standards at:
# http://grants.nih.gov/grants/guide/notice-files/NOT-OD-15-089.html
eth = cohort['ethnicity'].astype('category')

# A categorical only accepts values that are already categories, so register
# the three group labels that this cell introduces itself.
eth = eth.cat.add_categories(['HISPANIC/LATINO','MULTI/OTHER','UNKNOWN'])

# (group label, substrings that select it). The rules are applied in this
# order, and each one is matched against the labels as already rewritten by
# the rules before it, so the order is part of the behavior.
eth_rules = [
    ('HISPANIC/LATINO', ('HISPANIC', 'PORTUGUESE')),
    ('ASIAN', ('ASIAN',)),
    ('BLACK/AFRICAN AMERICAN', ('BLACK', 'AFRICAN')),
    ('WHITE', ('WHITE', 'MIDDLE EAST')),
    ('MULTI/OTHER', ('MULTI', 'OTHER')),
    ('UNKNOWN', ('DECLINE', 'UNABLE', 'UNKNOWN')),
]
for group, keywords in eth_rules:
    # boolean mask of rows whose current label contains any of the keywords
    hits = np.array([any(key in label for key in keywords) for label in eth],
                    dtype=bool)
    eth[hits] = group

# drop the raw labels that no longer occur after regrouping
eth = eth.cat.remove_unused_categories()

# fraction of the cohort in each group, listed in category order
eth_fracs = pd.value_counts(eth.values, normalize=True, sort=False)
print(eth_fracs)

AMERICAN INDIAN/ALASKA NATIVE    0.000374
ASIAN                            0.019656
BLACK/AFRICAN AMERICAN           0.064770
WHITE                            0.727256
HISPANIC/LATINO                  0.031636
MULTI/OTHER                      0.027331
UNKNOWN                          0.128978
dtype: float64


In [173]:
# convert raw MAP readings to MAP features
# Load the readings inside this cell so that it also runs on a fresh kernel;
# the read used to be commented out, which left `maps` as hidden kernel state.
maps = pd.read_csv(data_path+'map.csv')
interval = 60  # width of one time bin, in the units of `min_from_intime`

# Hour bin of every reading. Floor division keeps readings taken before time
# zero in their own negative bins; truncating toward zero (the previous
# astype(int) on a plain quotient) folded them into hour 0.
maps['hour'] = (maps['min_from_intime'] // interval).astype(int)

# mean MAP per stay and hour, as a one-column frame indexed by (icustay_id, hour)
mean_maps = maps.groupby(['icustay_id', 'hour'])['value'].mean().to_frame()

# first and last hour bin that holds a reading, per stay
stay_hours = mean_maps.reset_index('hour').groupby(level='icustay_id')['hour']
min_hours = stay_hours.min()
max_hours = stay_hours.max()

# every (stay, hour) pair between a stay's first and last observed hour
interp_index = [(this_icustay, hour)
                for this_icustay in min_hours.index
                for hour in np.arange(min_hours.loc[this_icustay],
                                      max_hours.loc[this_icustay] + 1)]

# Reindex onto the full hourly grid (hours without a reading become NaN) and
# fill them linearly. The grid starts and ends on observed hours of each stay,
# so every gap lies between two hourly means of the same stay.
# NOTE: the second level is named 'hours' although the column above is 'hour';
# the name is kept because it becomes a column of interp_mean_maps/min_maps.
full_index = pd.MultiIndex.from_tuples(interp_index, names=['icustay_id','hours'])
interp_mean_maps = mean_maps.reindex(full_index)['value'].interpolate(method='linear')
interp_mean_maps = interp_mean_maps.to_frame().reset_index()

# get percent of hours missing a MAP value: grid rows that had no hourly mean
missing_map = len(interp_mean_maps.index) - len(mean_maps.index)

frac_missing = missing_map/float(len(interp_mean_maps.index))
print("Fraction of hours missing MAP values: %s" % frac_missing)

# get minimum MAP value per patient
# min_maps keeps the row labels of interp_mean_maps as its index; the stay id
# is available in its 'icustay_id' column.
min_ind = interp_mean_maps.groupby('icustay_id')['value'].idxmin(skipna=True)
min_maps = interp_mean_maps.loc[min_ind]

# bin MAP values with edges 30, 40, ..., 90, 200
# NOTE(review): with pd.cut's defaults, values <= 30 or > 200 get no bin (NaN)
# and are left out of the per-stay fractions below -- confirm this is intended.
map_cutoffs = np.append(np.arange(30,100,10),200)
interp_mean_maps['bins'] = pd.cut(interp_mean_maps['value'], map_cutoffs)
binned_min_maps = pd.cut(min_maps['value'], map_cutoffs)

# fraction of each stay's binned hours that falls in each MAP bin
map_fracs = interp_mean_maps.groupby('icustay_id')['bins'].value_counts(normalize=True)

Fraction of hours missing MAP values: 0.0729135812521


In [174]:
# use creatinine measurements to determine AKI onset
creatinine = pd.read_csv(data_path+'creatinine.csv').dropna()

# AKIN definition: increase in serum creatinine >= 0.3 mg/dL OR increase of >= 50% in serum creatinine
# OR reduction in urine output (<0.5 mL/kg) within 48 hrs
time_lim = 48 #hours

# Slack for the threshold comparisons below. Differences of decimal values are
# not exact in binary floating point (1.2 - 0.9 evaluates to
# 0.29999999999999993), so a rise of exactly 0.3 would fail a bare `>= 0.3`.
float_tol = 1e-9

# first creatinine measurement of every stay (row with the smallest min_from_intime)
first_creat = creatinine.loc[creatinine.groupby('icustay_id')['min_from_intime'].idxmin(skipna=True)]

# attach that first measurement to every row of the same stay as *_first columns
creatinine = creatinine.merge(first_creat,suffixes=('','_first'),on=['icustay_id'],how='left')

# get rid of values outside specified time window
# Both bounds are strict, so the first measurement itself is excluded; a stay
# with no later measurement inside the window therefore has no row left and
# does not appear in aki_summary.
window_end = creatinine.min_from_intime_first + time_lim*60
in_window = ((creatinine.min_from_intime_first < creatinine.min_from_intime)
             & (creatinine.min_from_intime < window_end))
creatinine = creatinine[in_window]

# get max creatinine value within time window
max_creat = creatinine.loc[creatinine.groupby('icustay_id')['value'].idxmax(skipna=True)]

# nicely summarize creatinine data: one row per stay with first and max value
aki_summary = (max_creat[['icustay_id','value_first','value']]
               .rename(columns={'value': 'value_max'}))

# use first and max creatinine within 48 hrs to determine AKI
# TODO: incorporate urine information
creat_rise = aki_summary.value_max - aki_summary.value_first
aki_summary['aki'] = ((creat_rise >= 0.3 - float_tol) |
                      (creat_rise >= 0.5*aki_summary.value_first - float_tol))

# same table keyed by stay id, for joining onto other per-stay tables
aki = aki_summary.set_index('icustay_id')

In [None]:
# Assemble the per-stay table for the regression: cohort covariates, grouped
# ethnicity, minimum hourly MAP and the AKI label, indexed by icustay_id.
# NOTE(review): the describe() output above shows max_lactate up to 999999 and
# age up to ~309.8; presumably placeholder values -- confirm how they should be
# treated before fitting a model on these columns.

# .copy() so the new column is written to an independent frame rather than to
# a slice of `cohort`
lr_data = cohort[['icustay_id','age','los','max_lactate','vaso_frac','gender']].copy()
lr_data['eth'] = eth  # aligned on cohort's row index, which eth was derived from
lr_data = lr_data.set_index('icustay_id')

# min_maps is indexed by row labels of interp_mean_maps, not by stay, so key it
# by icustay_id before joining; joining it as-is would match stay ids against
# row numbers. Only the MAP value is carried over, under the name 'min_map'.
min_map = (min_maps.set_index('icustay_id')[['value']]
           .rename(columns={'value': 'min_map'}))
lr_data = lr_data.join(min_map, how='left')

# `aki` is the icustay_id-indexed version of aki_summary (whose own index is
# creatinine row labels). The inner join keeps only stays that have an AKI label.
lr_data = lr_data.join(aki['aki'], how='inner')