In [11]:
import numpy as np
import pandas as pd
from datetime import datetime
import datetime as dt
#pd.set_option('display.max_rows', None)
# Notebook-wide setting: turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

In [12]:
def create_liquid_input_features(cohort, table, output_file_name, table_name, model_type, label):
    """Run the full liquid-input pipeline: preprocess, aggregate per day, write features.

    The raw table is first passed through prepare_table (with model_type), the
    result is aggregated by create_daily_data, and the daily table is handed to
    create_daily_features together with the cohort, output path, label and
    table name. Nothing is returned.
    """
    create_daily_features(
        cohort,
        create_daily_data(prepare_table(table, model_type)),
        output_file_name,
        label,
        table_name,
    )
    
def create_liquid_output_features(cohort, table, output_file_name, table_name, label):
    """Run the liquid-output pipeline: aggregate per day, then write features.

    Unlike the input pipeline there is no prepare_table step: the table goes
    straight to create_daily_data, and the daily table is handed to
    create_daily_features together with the cohort, output path, label and
    table name. Nothing is returned.
    """
    create_daily_features(
        cohort,
        create_daily_data(table),
        output_file_name,
        label,
        table_name,
    )

In [19]:
# Preprocess the MIMIC liquid-input data: for every input that was administered over several days,
# create one row per day holding the amount administered that day (total amount / number of days).

def prepare_table(table, model):
    print("starting processing input_liquid_clean table.")
    table['charttime'] = table['start_date']
    # create column of date limit for including rows in calculations:
    # (if an input was given several days, and some of them were after target time, they should be excluded)
    if model == 'a':
        table['limit_time'] = table['target_time']
    elif model == 'b':
        table['limit_time'] = table['target_time'].apply(lambda x: x + dt.timedelta(days=1))
    table['duration_days'] = table['duration_days'].apply(lambda x: int(x))
    new_rows = pd.DataFrame(columns=table.columns)
    for index, row in table.iterrows():
        days = row['duration_days']
        if days > 1:
            daily_amount = row['originalamount']/days
            table.at[index,'originalamount'] = daily_amount
            row['originalamount'] = daily_amount
            for i in range(days-1):
                row['charttime'] += dt.timedelta(days=1)
                new_rows = new_rows.append(row)           
    
    new_table = pd.concat([table, new_rows])
    print("rows to add: ", len(new_rows))
    print('original len: ', len(table), "new len: ", len(new_table))
    
    new_table = new_table.sort_values(by=['identifier', 'charttime'])
    new_table = new_table.reset_index()
    new_table = new_table.rename(columns={"originalamount": "value"})
    new_table = new_table.drop(new_table[new_table['charttime'] > new_table['limit_time']].index)
    print("final len after dropping: ", len(new_table))
    print("done creating processed table") 
    #display(new_table)

    return new_table

In [8]:
def create_daily_data(table):
    """Aggregate measurement rows into one row per identifier and calendar day.

    Reads the columns 'identifier', 'charttime', 'target_time' and 'value'.
    The date part of 'charttime' defines the day; 'value' is summed per day.
    The 'target day' of an identifier is the date part of 'target_time' on that
    identifier's first row.

    The input frame is not modified (the helper date columns live on an
    internal copy), and the result is assembled with a single concat.

    Parameters
    ----------
    table : pandas.DataFrame
        Rows with datetime-like 'charttime' and 'target_time' values (each
        value must provide a .date() method).

    Returns
    -------
    pandas.DataFrame
        Columns ['identifier', 'date', 'daily amount', 'target day'].
        Identifiers appear in order of first appearance in `table`; within an
        identifier rows are ordered by date. Empty (with the same columns)
        when `table` has no rows.
    """
    output_columns = ['identifier', 'date', 'daily amount', 'target day']

    work = table[['identifier', 'value']].copy()
    # keep only the date part of the timestamps
    work['date'] = table['charttime'].apply(lambda x: x.date())
    work['target day'] = table['target_time'].apply(lambda x: x.date())

    daily_frames = []
    # sort=False keeps the identifiers in order of first appearance
    for iden, iden_rows in work.groupby('identifier', sort=False):
        daily = iden_rows.groupby('date')['value'].sum().reset_index()  # sum values for each date
        daily.insert(0, 'identifier', iden)
        daily['target day'] = iden_rows['target day'].iloc[0]
        daily_frames.append(daily)

    if daily_frames:
        df_daily = pd.concat(daily_frames, ignore_index=True)
        df_daily.columns = output_columns
    else:
        # pd.concat refuses an empty list; return an empty frame with the same schema
        df_daily = pd.DataFrame(columns=output_columns)

    print("done creating df_daily table")
    return df_daily


In [9]:
def create_daily_features(cohort, table, output_file_name, label, table_name):
    """Compute per-identifier statistics of the daily amounts and write them to CSV.

    For every identifier in `cohort` (sorted by identifier) one output row is
    written. Identifiers that also appear in `table` get, in this column order:

    * label + ' max' / ' min' / ' average' / ' last' of 'daily amount'
      ('last' is the final row of that identifier in `table`);
    * label + ', delta bw last,' + ' max' / ' min' / ' average':
      last value minus the respective statistic;
    * label + ', days to target from' + ' max' / ' min' / ' last':
      whole days between the 'target day' and the date of the first row holding
      the max / min, or the date of the last row.

    Identifiers missing from `table` keep NaN in all feature columns. Cohort
    rows whose identifier is NaN are left out of the output.

    The table is grouped once and the output frame is built in a single step,
    so every cohort row yields exactly one output row even when identifiers
    repeat in the cohort.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must contain an 'identifier' column.
    table : pandas.DataFrame
        Daily table with columns 'identifier', 'date', 'daily amount' and
        'target day'. 'target day' minus 'date' must yield an object with a
        .days attribute.
    output_file_name : str
        Path of the CSV file to write (utf-8, without the index).
    label : str
        Prefix of every feature column name.
    table_name : str
        Used only in the progress messages.

    Returns
    -------
    None
    """
    print("started creating features from ",table_name)
    cohort = cohort.sort_values(by=['identifier'])

    # statistics calculated for each label:
    features = [' max', ' min', ' average', ' last']
    # statistics for which the delta to the last measurement is calculated:
    stats = [' max', ' min', ' average']
    delta_str = ', delta bw last,'
    # measurements for which the time to the target day (in days) is calculated:
    time_feat = [' max', ' min', ' last']
    time_str = ', days to target from'

    feature_columns = (
        [label + feat for feat in features]
        + [label + delta_str + stat for stat in stats]
        + [label + time_str + feat for feat in time_feat]
    )

    # Group once instead of filtering the whole table for every identifier.
    rows_by_identifier = {
        iden: iden_rows for iden, iden_rows in table.groupby('identifier', sort=False)
    }

    records = []
    for iden in cohort['identifier']:
        if pd.isna(iden):
            # A NaN identifier can never be matched by equality, so it yields no row.
            continue
        record = {'identifier': iden}
        iden_rows = rows_by_identifier.get(iden)
        if iden_rows is not None:  # identifiers absent from table stay all-NaN
            amounts = iden_rows['daily amount']
            max_amount = amounts.max()
            min_amount = amounts.min()
            last_val = amounts.iloc[-1]
            stat_values = [max_amount, min_amount, amounts.mean(), last_val]

            # delta between the last measurement and max / min / average:
            delta_values = [last_val - value for value in stat_values[:len(stats)]]

            # time from the max / min / last measurement to the target day:
            target_day = iden_rows['target day'].iloc[-1]
            max_day = iden_rows.loc[amounts == max_amount, 'date'].iloc[0]
            min_day = iden_rows.loc[amounts == min_amount, 'date'].iloc[0]
            last_day = iden_rows['date'].iloc[-1]
            day_gaps = [(target_day - day).days for day in (max_day, min_day, last_day)]

            record.update(zip(feature_columns, stat_values + delta_values + day_gaps))
        records.append(record)

    df_final = pd.DataFrame(records, columns=['identifier'] + feature_columns)
    # Feature columns are floating point throughout (NaN marks "no data").
    df_final[feature_columns] = df_final[feature_columns].astype(float)

    print("done creating features from ",table_name)
    print("shape: ",df_final.shape)

    df_final.to_csv(output_file_name, encoding='utf-8', index=False)

    print("done creating {}_for_modeling.csv\n".format(table_name))