In [None]:
%matplotlib inline
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Load the cohort table (tab-separated) from the data-extraction output folder
data_path = '../data_extraction/data/'
cohort = pd.read_csv(data_path + 'cohort.tsv', sep='\t')

In [None]:
#weightcovariates
#help(model_with_height.predict)
#df.ix[df.my_channel > 20000, 'my_channel'] = 0
#cohort.ix[cohort.index == 254478, 'weight']

In [None]:
# Multiple regression for weight based on age, sex, and height if present.
#
# Two ordinary-least-squares models are fitted on the stays that have a recorded
# weight (one for stays with a height, one for stays without) and are then used
# to predict the weight of the stays where it is missing.

# Index the cohort by ICU stay. Guarded so that re-running this cell does not
# raise a KeyError once 'icustay_id' has already been moved into the index.
if cohort.index.name != 'icustay_id':
    cohort.set_index('icustay_id', inplace=True)

# Cap every age above 90 at 90 (.loc replaces the .ix indexer, which no longer
# exists in pandas >= 1.0).
# NOTE(review): presumably needed because very high ages are obscured in the
# source data - confirm.
cohort.loc[cohort['age'] > 90, 'age'] = 90

# Explicit copy: the derived columns below belong to this working table only.
weightcovariates = cohort[['age', 'gender', 'height', 'weight']].copy()
weightcovariates['agesquared'] = weightcovariates['age'] ** 2
#weightcovariates = weightcovariates.dropna(subset=['weight'])
# Encode gender as integer category codes (a missing gender is coded as -1).
weightcovariates['gender'] = pd.Categorical(weightcovariates['gender']).codes

# Split into training (weight recorded) and testing (weight missing).
has_weight = weightcovariates['weight'].notnull()
wc_train = weightcovariates.loc[has_weight]
wc_test = weightcovariates.loc[~has_weight]

features_with_height = ['age', 'agesquared', 'gender', 'height']
features_without_height = ['age', 'agesquared', 'gender']

# Split training data into subsets with and without height. Train 2 models.
train_has_height = wc_train['height'].notnull()
c_with_height = wc_train.loc[train_has_height, features_with_height]
c_without_height = wc_train.loc[~train_has_height, features_without_height]

w_with_height = wc_train.loc[train_has_height, 'weight']
w_without_height = wc_train.loc[~train_has_height, 'weight']

# Split testing data into subsets with and without height.
test_has_height = wc_test['height'].notnull()
c_with_height_test = wc_test.loc[test_has_height, features_with_height]
c_without_height_test = wc_test.loc[~test_has_height, features_without_height]

# Train the model using the training sets
# NOTE(review): no constant column is added to the covariates, so both fits are
# regressions through the origin - confirm that this is intended.
model_with_height = sm.OLS(w_with_height, c_with_height)
model_without_height = sm.OLS(w_without_height, c_without_height)
results_with_height = model_with_height.fit()
results_without_height = model_without_height.fit()

# Predict the missing weights.
# Current statsmodels returns a pandas Series labelled by icustay_id when the
# covariates are a DataFrame. Converting to a plain array keeps the predictions
# positionally aligned with the rows of the *_test tables, which is how they are
# consumed when the weights are filled in.
w_with_height_predict = np.asarray(results_with_height.predict(c_with_height_test)) # 56 to 93kg
w_without_height_predict = np.asarray(results_without_height.predict(c_without_height_test)) # 42 to 88kg. Seems reasonable.

In [None]:
# Fill in the missing weights
#
# Row i of each *_test covariate table corresponds to prediction i, so the
# predictions are written back to cohort in one label-aligned assignment per
# model instead of one boolean-mask lookup per stay.
# - .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0.
# - np.asarray drops any index on the predictions, so they are matched to the
#   covariate rows by position and never by icustay_id label.
# Assumes cohort is indexed by a unique icustay_id - TODO confirm.
for test_covariates, predicted_weights in (
    (c_with_height_test, w_with_height_predict),
    (c_without_height_test, w_without_height_predict),
):
    predicted_weights = np.asarray(predicted_weights)
    if len(predicted_weights) == 0:
        # No stay of this kind is missing a weight.
        continue
    cohort.loc[test_covariates.index, 'weight'] = predicted_weights

#allweights = np.array(cohort['weight'])
#np.where(np.isnan(allweights) == True)  # checking no more nans

In [None]:
# Process urine values
#
# For every ICU stay the urine measurements are spread over consecutive 4h
# blocks, each block is compared against a weight-based volume threshold, and
# the stay is flagged as AKI when 3 consecutive blocks fall below it.

BLOCK_MINUTES = 240       # length of one urine-output block: 4h, in minutes
LOW_BLOCKS_FOR_AKI = 3    # consecutive low-output blocks needed to flag AKI


def _urine_volume_blocks(t, u, block_minutes=BLOCK_MINUTES):
    """Spread urine volumes over consecutive fixed-length time blocks.

    Parameters
    ----------
    t : 1-D array of measurement times in minutes, in non-decreasing order and
        relative to the first measurement (so t[0] == 0).
    u : 1-D array of volumes. u[i] is treated as collected at a constant rate
        between t[i-1] and t[i]. u[0] is not used, because the start of its
        collection interval is unknown.
    block_minutes : length of one block in minutes.

    Returns
    -------
    Array of length ceil(t[-1] / block_minutes); element k is the volume
    attributed to the block [k*block_minutes, (k+1)*block_minutes).
    """
    nblocks = int(np.ceil(t[-1] / block_minutes))
    blocks = np.zeros(nblocks)
    if nblocks == 0:
        # Every measurement shares the first timestamp: there is no block to fill.
        return blocks

    for ind in range(1, len(u)):
        start, end = t[ind - 1], t[ind]
        interval = end - start

        if interval <= 0:
            # Zero-length interval (two measurements recorded at the same minute):
            # the volume cannot be spread over time, so credit all of it to the
            # block containing the timestamp instead of computing 0/0.
            blocknum = min(int(end // block_minutes), nblocks - 1)
            blocks[blocknum] += u[ind]
            continue

        # Every block that overlaps (start, end] receives the share of the volume
        # proportional to the length of the overlap. A measurement taken exactly
        # on a block boundary belongs entirely to the block(s) before it.
        first_block = int(start // block_minutes)
        last_block = min(int(np.ceil(end / block_minutes)) - 1, nblocks - 1)
        for blocknum in range(first_block, last_block + 1):
            overlap = (min(end, (blocknum + 1) * block_minutes)
                       - max(start, blocknum * block_minutes))
            blocks[blocknum] += u[ind] * overlap / interval
    return blocks


def _first_low_run(is_low, run_length=LOW_BLOCKS_FOR_AKI):
    """Return the index of the first block that starts `run_length` consecutive
    True values in `is_low`, or -1 when there is no such run."""
    consecutive = 0
    for blocknum, low in enumerate(is_low):
        consecutive = consecutive + 1 if low else 0
        if consecutive == run_length:
            return blocknum - run_length + 1
    return -1


urine = pd.read_csv(data_path+'urine.csv')
# Discard implausibly large single measurements (presumably ml - TODO confirm units).
urine = urine.loc[urine['value'] < 5000]

icustays = np.unique(urine['icustay_id'])

# For each icustay key, the array of urine output volumes in 4h windows
urine_4h = {}
# One row per icustay: [icustay_id, binary aki outcome, starting time of the 4h window aki was identified in]
aki_urine = np.zeros([len(icustays), 3], dtype='int')

# Group once instead of filtering the whole table for every stay. The stable
# sort guarantees chronological order within each stay and keeps the original
# order of measurements that share a timestamp.
urine_by_stay = urine.sort_values('min_from_intime', kind='mergesort').groupby('icustay_id')

# Weight of every stay, aligned with `icustays`; NaN when the stay is not in
# cohort. Relies on cohort being indexed by icustay_id.
cohort_weights = cohort['weight']
cohort_weights = cohort_weights[~cohort_weights.index.duplicated()]
weight_for_stay = cohort_weights.reindex(icustays).values

for icuind, icustay in enumerate(icustays):
    aki_urine[icuind, 0] = icustay

    # Get all the urine values and times for the icustay_id
    stay_urine = urine_by_stay.get_group(icustay)
    u = stay_urine['value'].values
    t = stay_urine['min_from_intime'].values

    # Keep the time of first urine measurement, and get relative times.
    t0 = t[0]
    t = t - t0

    # Urine volumes for 4h blocks, starting from the time of the first measurement.
    # Urine output before the first measurement is not calculated, hence the
    # first urine measurement itself is not used.
    # NOTE(review): the number of blocks is rounded up, so the last block is
    # usually only partly covered by measurements but is still compared against
    # the full-block threshold - confirm that this is intended.
    urine_blocks = _urine_volume_blocks(t, u)

    # Get the patient weight to calculate RIFLE criteria.
    patientweight = weight_for_stay[icuind]
    if np.isnan(patientweight):
        # Without a weight the criterion cannot be evaluated; the stay keeps
        # aki_result = 0 instead of aborting the whole loop.
        continue

    # Whether the urine block meets the I criteria: less than 2 volume units per
    # kg of body weight in the 4h block (presumably 0.5 ml/kg/h - TODO confirm).
    urine_blocks_I = urine_blocks < (2 * patientweight)

    # Find 3 consecutive 4h blocks that satisfy the I kidney injury criteria
    onset_block = _first_low_run(urine_blocks_I)
    if onset_block >= 0:
        aki_urine[icuind, 1] = 1
        # The starting time of the aki onset window, on the min_from_intime scale
        aki_urine[icuind, 2] = int(onset_block * BLOCK_MINUTES + t0)

    #  ----------  Optional: Save the 4h urine volumes. Very slow ------------- #
    #urine_4h[icustay]=urine_blocks


# Convert to pandas dataframe
aki_urine_frame = pd.DataFrame(aki_urine[:,1:], index=aki_urine[:,0], columns=['aki_result', 'aki_onset_t'])
                

In [None]:
# The AKI results of each icustay: one row per icustay_id (the index), with the
# binary 'aki_result' flag and the 'aki_onset_t' start time of the onset window.
aki_urine_frame

In [None]:
# Summarise how many ICU stays were flagged as AKI by the urine criterion
n_aki_stays = sum(aki_urine[:, 1])
n_stays = len(aki_urine)
print("Number of urine aki patients: %s out of %s" % (n_aki_stays, n_stays))

In [None]:
# The 4 hour urine volumes (in a dictionary) if didn't comment out 
#urine_4h