In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [188]:
# Load the cohort table and the raw measurement tables from the extraction
# output directory (path is relative to this notebook).
data_path = '../data_extraction/data/'

# The cohort file is tab-separated; the measurement files are comma-separated.
cohort = pd.read_csv(data_path + 'cohort.tsv', sep='\t')
creatinine, maps, urine = (
    pd.read_csv(data_path + filename)
    for filename in ('creatinine.csv', 'map.csv', 'urine.csv')
)

In [159]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric cohort columns.
# NOTE(review): the recorded output shows a max_lactate maximum of 999999 and
# ages above 300; these look like sentinel / shifted values that distort the
# mean and std -- confirm before relying on those two rows.
numeric_cols = ['age', 'los', 'max_lactate', 'vaso_frac']
numeric_summary = cohort[numeric_cols].describe()
print(numeric_summary)

               age          los    max_lactate    vaso_frac
count  5342.000000  5342.000000    4485.000000  5342.000000
mean     71.703842    11.416165     226.160700     0.133613
std      48.474031    10.331876   14931.965231     0.214067
min      18.138720     3.000300       0.500000     0.000000
25%      52.494296     4.790000       1.500000     0.000000
50%      65.322301     7.903400       2.200000     0.006947
75%      76.984900    14.118000       3.800000     0.196396
max     309.777981   173.072500  999999.000000     0.995077


In [155]:
# Number of cohort rows per distinct gender value (unsorted).
gender_values = cohort['gender'].values
gender_counts = pd.value_counts(gender_values, sort=False)
print(gender_counts)

F    2613
M    2729
dtype: int64


In [156]:
# ethnicity summary

# grouping based on the standards at: http://grants.nih.gov/grants/guide/notice-files/NOT-OD-15-089.html
# Ordered (group label, keywords) rules. A raw ethnicity string is assigned to
# the FIRST group that has a keyword occurring in it, so the order matters
# (e.g. a value containing both 'ASIAN' and 'OTHER' is grouped as 'ASIAN').
ETHNICITY_GROUPS = [
    ('HISPANIC/LATINO', ('HISPANIC', 'PORTUGUESE')),
    ('ASIAN', ('ASIAN',)),
    ('BLACK/AFRICAN AMERICAN', ('BLACK', 'AFRICAN')),
    ('WHITE', ('WHITE', 'MIDDLE EAST')),
    ('MULTI/OTHER', ('MULTI', 'OTHER')),
    ('UNKNOWN', ('DECLINE', 'UNABLE', 'UNKNOWN')),
]


def group_ethnicity(raw_label):
    """Return the reporting group for one raw ethnicity string.

    The rules in ETHNICITY_GROUPS are tried in order and the first one with a
    keyword contained in `raw_label` wins. A label that matches no rule is
    returned unchanged, so it stays visible as its own category.
    """
    for group, keywords in ETHNICITY_GROUPS:
        if any(keyword in raw_label for keyword in keywords):
            return group
    return raw_label


# Classify each distinct raw label once instead of scanning every row for every
# rule. Only observed labels are classified, so missing ethnicities are never
# passed to the substring test and simply stay missing after the map below.
raw_labels = list(cohort['ethnicity'].astype('category').cat.categories)
group_of = {label: group_ethnicity(label) for label in raw_labels}

# Category order: raw labels that survive as themselves keep their sorted
# position, followed by group labels that did not occur in the raw data (in
# rule order). Labels with no rows are left out.
new_labels = [group for group, _ in ETHNICITY_GROUPS if group not in raw_labels]
used_labels = set(group_of.values())
category_order = [label for label in raw_labels + new_labels
                  if label in used_labels]

eth = cohort['ethnicity'].map(group_of).astype('category')
eth = eth.cat.set_categories(category_order)

print(pd.value_counts(eth.values, sort=False))

AMERICAN INDIAN/ALASKA NATIVE       2
ASIAN                             105
BLACK/AFRICAN AMERICAN            346
WHITE                            3885
HISPANIC/LATINO                   169
MULTI/OTHER                       146
UNKNOWN                           689
dtype: int64


In [331]:
# convert raw MAP readings to MAP features

# for now, limit readings to first 72 hours
num_hours = 72
in_window = (0 < maps.min_from_intime) & (maps.min_from_intime < num_hours*60)
# Take an explicit copy of the filtered rows: the 'hour' column is added below,
# and assigning a column to a boolean-indexed slice triggers pandas'
# SettingWithCopyWarning.
maps = maps[in_window].copy()

# hour index of every reading: minutes since ICU admission divided by 60 and
# truncated to an integer (0 = first hour)
maps['hour'] = (maps.min_from_intime/60).astype(int)

# get means for every hour and minimum value
mean_maps = maps.groupby(['icustay_id', 'hour'])['value'].mean()
min_maps = maps.groupby(['icustay_id'])['value'].min()

# bin MAP values
# TODO: How to handle NaNs (no data recorded during that hour)?
# Bin edges are 30, 40, ..., 90, 200, i.e. right-closed bins (30, 40] ... (90, 200].
# NOTE(review): pd.cut also returns NaN for hourly means <= 30 or > 200, which
# is indistinguishable from other NaNs downstream -- confirm this is intended.
map_cutoffs = np.append(np.arange(30,100,10),200)
binned_mean_maps = pd.cut(mean_maps, map_cutoffs)