In [34]:
import numpy as np
import pandas as pd
import math
# Turn off pandas' chained-assignment check so that assigning into a DataFrame
# slice does not emit SettingWithCopyWarning.
pd.options.mode.chained_assignment = None
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [35]:
def p25(x):
    """Return the 25th percentile of ``x`` as computed by ``np.percentile``."""
    lower_quartile = np.percentile(x, q=25)
    return lower_quartile

def p75(x):
    """Return the 75th percentile of ``x`` as computed by ``np.percentile``."""
    upper_quartile = np.percentile(x, q=75)
    return upper_quartile

# Column-name suffixes used by create_lab_features.
# Statistics that also get a "delta between last value and statistic" column:
_LAB_STAT_SUFFIXES = [' max', ' min', ' average', ' median', ' 25th percentile', ' 75th percentile']
# Every per-label statistic column, in output order:
_LAB_FEATURE_SUFFIXES = _LAB_STAT_SUFFIXES + [' std', ' amount', ' last']
# Measurements whose hours-to-target are reported:
_LAB_TIME_SUFFIXES = [' max', ' min', ' last']
_LAB_DELTA_STR = ', delta bw last,'
_LAB_TIME_STR = ', hours to target from'


def _lab_hours_at_value(values, hours, target):
    """Return the ``hours`` entry of the last row whose value equals ``target`` (NaN if no row matches)."""
    at_target = hours[values == target]
    return at_target.iloc[-1] if len(at_target) > 0 else np.nan


def _lab_label_features(label, label_rows):
    """Compute every feature of one label for one identifier.

    ``label_rows`` holds the rows of a single (identifier, label) pair, already
    ordered by charttime, so positional ``iloc[-1]`` is the last measurement.
    Returns a dict mapping output column name -> value.
    """
    values = label_rows['value']
    hours = label_rows['hours_from_charttime_to_targettime']

    # pandas reductions skip NaN; np.average / np.percentile propagate it.
    # Series.std() uses ddof=1, so a single measurement yields NaN.
    summary = {
        ' max': float(values.max()),
        ' min': float(values.min()),
        ' average': float(np.average(values)),
        ' median': float(values.median()),
        ' 25th percentile': float(np.percentile(values, 25)),
        ' 75th percentile': float(np.percentile(values, 75)),
        ' std': float(values.std()),
        ' amount': float(len(values)),
    }
    if np.isnan(summary[' std']):
        # if std is nan (because there is only one value) then it is 0.
        summary[' std'] = 0.0

    last_val = values.iloc[-1]
    out = {label + suffix: stat for suffix, stat in summary.items()}
    out[label + ' last'] = last_val

    # delta between the last measurement and each statistic
    for suffix in _LAB_STAT_SUFFIXES:
        out[label + _LAB_DELTA_STR + suffix] = last_val - summary[suffix]

    # hours to target time from the last, max and min measurements
    out[label + _LAB_TIME_STR + ' last'] = hours.iloc[-1]
    for suffix in (' max', ' min'):
        # An all-NaN group has no row equal to its max/min; the column stays NaN.
        out[label + _LAB_TIME_STR + suffix] = _lab_hours_at_value(values, hours, summary[suffix])
    return out


# create all lab features for creating the model
def create_lab_features(cohort, table, output_file_name, table_name):
    """Write one row of lab features per cohort identifier to a CSV file.

    For every label found in ``table`` the output has the columns
    ``<label> max/min/average/median/25th percentile/75th percentile/std/amount/last``,
    ``<label>, delta bw last, <stat>`` (last value minus each of the first six
    statistics) and ``<label>, hours to target from max/min/last``.
    Labels an identifier has no rows for are left as NaN.

    Parameters:
        cohort: DataFrame with an 'identifier' column; one output row is
            written per cohort row, ordered by identifier.
        table: DataFrame with the columns 'identifier', 'label', 'value',
            'charttime' and 'hours_from_charttime_to_targettime'.
        output_file_name: path of the CSV file to write (no index column).
        table_name: name used only in the progress messages.

    Returns:
        None. The result is written to ``output_file_name``.
    """
    print("started creating features from ",table_name)

    cohort = cohort.sort_values(by=['identifier'])
    idents = cohort['identifier']
    ident_frame = cohort[['identifier']].reset_index(drop=True)

    # Output columns: all statistics per label, then all deltas, then all times.
    labels = sorted(table['label'].dropna().unique())  # all labels in table, sorted
    feature_columns = (
        [label + suffix for label in labels for suffix in _LAB_FEATURE_SUFFIXES]
        + [label + _LAB_DELTA_STR + suffix for label in labels for suffix in _LAB_STAT_SUFFIXES]
        + [label + _LAB_TIME_STR + suffix for label in labels for suffix in _LAB_TIME_SUFFIXES]
    )

    # Sort once (multi-key sort keeps ties in their original order) and split the
    # table per identifier a single time instead of masking it for every identifier.
    table = table[table['identifier'].isin(idents)]
    table = table.sort_values(by=['label', 'charttime'])
    rows_by_ident = {iden: rows for iden, rows in table.groupby('identifier')}

    records = []
    for done, iden in enumerate(idents, start=1):
        record = {}
        rows = rows_by_ident.get(iden)
        if rows is not None:
            for label, label_rows in rows.groupby('label'):
                record.update(_lab_label_features(label, label_rows))
        records.append(record)

        if done % 200 == 0:
            print("done {} patients.".format(done))

    # Build the whole frame in one go; reindex adds the NaN columns of labels
    # that no cohort identifier has and fixes the column order.
    feature_frame = pd.DataFrame(records, index=range(len(records))).reindex(columns=feature_columns)
    df_final = pd.concat([ident_frame, feature_frame], axis=1)

    print("done creating features from ",table_name)
    print("shape: ",df_final.shape)

    #display(df_final)
    df_final.to_csv(output_file_name, encoding='utf-8', index=False)

    print("done creating {}_for_modeling.csv\n".format(table_name))
    

In [36]:
def p75(x):
    """Return the 75th percentile of ``x`` as computed by ``np.percentile``."""
    upper_quartile = np.percentile(x, q=75)
    return upper_quartile

def last(x):
    """Return the final element of ``x`` by position, using ``.iloc``."""
    final_position = -1
    return x.iloc[final_position]

def _relev_hours_at_value(values, hours, target):
    """Return the ``hours`` entry of the last row whose value equals ``target`` (NaN if no row matches)."""
    at_target = hours[values == target]
    return at_target.iloc[-1] if len(at_target) > 0 else np.nan


# Feature-name suffix -> calculation over one (identifier, label) group.
# ``values`` / ``hours`` are that group's 'value' and
# 'hours_from_charttime_to_targettime' columns, ordered by charttime.
# Series.max/min/median skip NaN; np.average / np.percentile propagate it.
_RELEV_CALCULATORS = {
    ' amount': lambda values, hours: len(values),
    ' last': lambda values, hours: values.iloc[-1],
    ' max': lambda values, hours: values.max(),
    ' min': lambda values, hours: values.min(),
    ' 75th percentile': lambda values, hours: np.percentile(values, 75),
    ' average': lambda values, hours: np.average(values),
    ' median': lambda values, hours: values.median(),
    ', delta bw last, min': lambda values, hours: values.iloc[-1] - values.min(),
    ', delta bw last, max': lambda values, hours: values.iloc[-1] - values.max(),
    ', hours to target from min': lambda values, hours: _relev_hours_at_value(values, hours, values.min()),
    ', hours to target from max': lambda values, hours: _relev_hours_at_value(values, hours, values.max()),
}


# create only the lab features that were selected in the feature selection when creating the model
def create_relev_features(cohort, table, output_file_name, table_name):
    """Write the selected lab features, one row per cohort identifier, to a CSV file.

    Only the features listed in ``features`` below are produced; a feature is
    NaN for an identifier that has no rows of the corresponding label.

    Parameters:
        cohort: DataFrame with an 'identifier' column; one output row is
            written per cohort row, ordered by identifier.
        table: DataFrame with the columns 'identifier', 'label', 'value',
            'charttime' and 'hours_from_charttime_to_targettime'.
        output_file_name: path of the CSV file to write (no index column).
        table_name: name used only in the progress messages.

    Returns:
        None. The result is written to ``output_file_name``.
    """
    print("started creating features from ",table_name)

    cohort = cohort.sort_values(by=['identifier'])
    idents = cohort['identifier']
    ident_frame = cohort[['identifier']].reset_index(drop=True)

    # the output feature columns, in output order:
    features = ['Anion Gap amount',
                'Bicarbonate, delta bw last, min',
                'Calculated Total CO2, hours to target from min',
                'Chloride, delta bw last, max',
                'Glucose amount',
                'Heart Rate last',
                'Heart Rate max',
                'Hematocrit amount',
                'Lactate, hours to target from max',
                'Magnesium amount',
                'pH, hours to target from min',
                'Platelet Count last',
                'Platelet Count min',
                'Platelet Count, hours to target from max',
                'pO2 75th percentile',
                'pO2 average',
                'pO2 median',
                'Potassium amount',
                'Sodium amount']
    wanted = set(features)

    # only the relevant labels:
    labels = ['Anion Gap', 'Bicarbonate', 'Calculated Total CO2', 'Chloride', 'Glucose', 'Heart Rate', 'Hematocrit',
              'Lactate', 'Magnesium', 'pH', 'Platelet Count', 'pO2', 'Potassium', 'Sodium']
    table = table[table['label'].isin(labels) & table['identifier'].isin(idents)]
    # charttime order makes "last" positional and resolves ties in the
    # min/max time lookup in favour of the later measurement
    table = table.sort_values(by=['identifier', 'charttime'])
    # split once instead of masking the whole table for every identifier
    rows_by_ident = {iden: rows for iden, rows in table.groupby('identifier')}

    records = []
    for done, iden in enumerate(idents, start=1):
        record = {}
        rows = rows_by_ident.get(iden)
        if rows is not None:
            for label, label_rows in rows.groupby('label'):
                values = label_rows['value']
                hours = label_rows['hours_from_charttime_to_targettime']
                for suffix, calculate in _RELEV_CALCULATORS.items():
                    if label + suffix in wanted:
                        record[label + suffix] = calculate(values, hours)
        records.append(record)

        if done % 200 == 0:
            print("relev - done {} patients.".format(done))

    # Feature columns are float, with NaN for features that were not computed.
    feature_frame = (
        pd.DataFrame(records, index=range(len(records)))
        .reindex(columns=features)
        .astype('float64')
    )
    df = pd.concat([ident_frame, feature_frame], axis=1)

    print("done creating features from ",table_name)
    print("shape: ",df.shape)

    #display(df)
    df.to_csv(output_file_name, encoding='utf-8', index=False)

    print("done creating {}_for_modeling.csv\n".format(table_name))

In [37]:
# create only the respiratory features that were selected in the feature selection when creating the model

def create_resp_features(cohort, table, output_file_name, table_name):
    """Write the 'Respiratory Rate, hours to target from min' feature to a CSV file.

    For every cohort identifier the feature is the
    'hours_from_charttime_to_targettime' of the last row (in ``table`` order)
    whose 'Respiratory Rate' value equals that identifier's minimum value.
    It is NaN when the identifier has no usable Respiratory Rate rows.

    Parameters:
        cohort: DataFrame with an 'identifier' column; one output row is
            written per cohort row, ordered by identifier.
        table: DataFrame with the columns 'identifier', 'label', 'value' and
            'hours_from_charttime_to_targettime'.
        output_file_name: path of the CSV file to write (no index column).
        table_name: name used only in the progress messages.

    Returns:
        None. The result is written to ``output_file_name``.
    """
    print("started creating features from ",table_name)

    # Create the dataframe with all the identifiers:
    cohort = cohort.sort_values(by=['identifier'])
    idents = cohort['identifier']
    df = cohort[['identifier']].reset_index(drop=True)
    feature = 'Respiratory Rate, hours to target from min'

    # keep only Respiratory Rate records and split them per identifier once,
    # instead of masking the whole table for every identifier:
    table = table.loc[table['label'] == 'Respiratory Rate']
    rows_by_ident = {iden: rows for iden, rows in table.groupby('identifier')}

    values = []
    for done, iden in enumerate(idents, start=1):
        val = np.nan
        rr_rows = rows_by_ident.get(iden)
        if rr_rows is not None:
            # Series.min() skips NaN; the builtin min() returns NaN when the
            # first value is NaN, which then matches no row.
            lowest = rr_rows['value'].min()
            at_lowest = rr_rows.loc[rr_rows['value'] == lowest, 'hours_from_charttime_to_targettime']
            if len(at_lowest) > 0:  # empty only when every value is NaN
                val = at_lowest.values[-1]
        values.append(val)

        if done % 500 == 0:
            print("done {} patients.".format(done))

    df[feature] = pd.Series(values, index=df.index)

    print("done creating features from ",table_name)
    print("shape: ",df.shape)

    #display(df)
    df.to_csv(output_file_name, encoding='utf-8', index=False)

    print("done creating {}_for_modeling.csv\n".format(table_name))