In [92]:
import numpy as np
import pandas as pd
import math
import time
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole kernel session.
# NOTE(review): this is a global switch - it also hides genuine accidental writes to copies.
pd.options.mode.chained_assignment = None
# Optional display tweaks for interactive inspection, left disabled:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
#pd.reset_option("display.max_rows")

In [14]:
# For patients who have verbal, motor, and eye-opening scores recorded at the same charttime but no
# GCS Total record for that charttime, add a GCS Total record equal to the sum of the other 3 scores.

def compute_gcs_total(table):
    """Add a 'GCS Total' row for every checkup that has three component scores but no total.

    A "checkup" is the set of rows sharing the same ``identifier`` and ``charttime``.
    A total is synthesized only when the checkup consists of exactly three rows with
    three distinct labels, none of them already 'GCS Total', and all three scores present.
    The new row is a copy of the checkup's first row (in ``table`` order) with ``score``
    and ``description`` set to the sum of the three scores and ``label`` set to 'GCS Total'.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns ``identifier``, ``charttime``, ``label`` and ``score``.
        A ``description`` column is overwritten on the synthesized rows (and created if
        it is missing).

    Returns
    -------
    pandas.DataFrame
        The original rows plus the synthesized totals, sorted by
        ``identifier``, ``charttime``, ``label``. ``table`` itself is not modified.
    """
    print("started processing - calculating total GCS")
    keys = ['identifier', 'charttime']

    new_rows = table.iloc[0:0]  # empty frame with the same columns/dtypes as the input
    if not table.empty:
        # Per-row view of the checkup each row belongs to (vectorized; replaces the nested
        # per-identifier / per-charttime Python loops and the removed DataFrame.append).
        flagged = table.assign(_is_gcs_total=table['label'].eq('GCS Total').astype(int))
        grouped = flagged.groupby(keys, sort=False)
        n_rows = grouped['score'].transform('size')
        n_scores = grouped['score'].transform('count')    # non-null scores only
        n_labels = grouped['label'].transform('nunique')
        n_totals = grouped['_is_gcs_total'].transform('sum')

        # Exactly three distinct, fully scored components and no pre-existing total.
        complete = ((n_rows == 3) & (n_scores == 3) & (n_labels == 3) & (n_totals == 0)).to_numpy()

        if complete.any():
            components = table[complete]
            gcs_total = components.groupby(keys, sort=False)['score'].transform('sum').to_numpy()
            # Stamp the total on every component row, then keep one row per checkup.
            new_rows = (
                components
                .assign(score=gcs_total, description=gcs_total, label='GCS Total')
                .drop_duplicates(subset=keys, keep='first')
            )

    new_table = pd.concat([table, new_rows])
    print("rows to add: ", len(new_rows))
    print('original len: ', len(table), "new len: ", len(new_table))
    new_table = new_table.sort_values(by=['identifier', 'charttime', 'label'])

    print("done processing.")
    return new_table
    
        
    
    

In [101]:
# calculating only one feature that was selected for training the model (without calculating GCS Total)

def create_gcs_features_limited(cohort, table, output_file_name, table_name):
    """Write the single feature 'GCS Total, hours to target from first' for every cohort member.

    For each identifier the feature is the ``hours_from_charttime_to_targettime`` value of
    the earliest usable GCS checkup. A checkup (rows sharing ``identifier`` and
    ``charttime``) is usable when it is a 'GCS Total' row, or when it has at least three
    non-total rows (fewer than three means a total could not be computed for it).
    Rows with a null ``score`` are ignored. Identifiers without any usable checkup get NaN.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must contain an ``identifier`` column; one output row is written per cohort row,
        sorted by identifier.
    table : pandas.DataFrame
        Must contain ``identifier``, ``charttime``, ``label``, ``score`` and
        ``hours_from_charttime_to_targettime``.
    output_file_name : str or path-like
        Destination of the CSV (UTF-8, no index column).
    table_name : str
        Used only in the progress messages.

    Returns
    -------
    None
    """
    print("started creating features from ",table_name)
    keys = ['identifier', 'charttime']
    feature = 'GCS Total, hours to target from first'
    hours_col = 'hours_from_charttime_to_targettime'

    table = table.dropna(subset = ['score']) #remove rows where score is null
    # In the input, patients who have GCS Total have only it, and those who don't have GCS Total
    # have the other labels (1-3 of them). Set the totals aside and filter the component rows.
    is_total = (table['label'] == 'GCS Total').to_numpy()
    total = table[is_total]
    components = table[~is_total]

    # Remove checkups that have fewer than 3 component scores at one charttime. This is a
    # positional mask rather than a drop by index label, so it is correct even when the
    # frame's index is not unique, and it avoids one full scan per removed checkup.
    if not components.empty:
        rows_per_checkup = components.groupby(keys)['score'].transform('size')
        components = components[~(rows_per_checkup < 3).to_numpy()]

    # Combine the remaining rows with the GCS Total rows and order them chronologically.
    usable = pd.concat([components, total]).sort_values(by=keys)
    # First (non-null) hours value per patient, indexed by identifier.
    first_hours = usable.groupby('identifier')[hours_col].first()

    # One output row per cohort member, sorted by identifier; unmatched identifiers map to NaN.
    idents = cohort.sort_values(by=['identifier'])['identifier'].reset_index(drop=True)
    df = pd.DataFrame({'identifier': idents, feature: idents.map(first_hours)})

    print("done creating features from ",table_name) 
    print("shape: ",df.shape)

    df.to_csv(output_file_name, encoding='utf-8', index=False)

    print("done creating {}_for_modeling.csv\n".format(table_name))


In [4]:
def create_gcs_features_full(cohort, table, output_file_name, table_name):
    """Build the full per-patient GCS feature table and write it to a CSV file.

    ``table`` is first passed through ``compute_gcs_total`` and rows with a null ``score``
    are dropped. For every label present afterwards, three families of columns are produced
    (column name = label + suffix):

    * ' max', ' min', ' average', ' amount', ' first', ' last' - statistics of the score;
    * ', delta bw last, max|min|average|first' - last score minus that statistic;
    * ', hours to target from max|min|last|first' - ``hours_from_charttime_to_targettime``
      of the corresponding measurement (for max/min: the latest row holding that score).

    Cohort members without any rows in ``table`` keep NaN in every feature column.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must contain an ``identifier`` column; output rows follow its sorted order.
    table : pandas.DataFrame
        Must contain ``identifier``, ``charttime``, ``label``, ``score`` and
        ``hours_from_charttime_to_targettime``.
    output_file_name : str or path-like
        Destination of the CSV (UTF-8, no index column).
    table_name : str
        Used only in the progress messages.

    Returns
    -------
    None
    """
    print("started creating features from ",table_name)
    table = compute_gcs_total(table)
    table = table.dropna(subset = ['score']) #remove rows where score is null

    hours_col = 'hours_from_charttime_to_targettime'
    # for calculating stats for each label:
    features = [' max', ' min', ' average', ' amount', ' first', ' last']
    agg_features = features[:4]  # the ones produced directly by the groupby aggregation
    # for calculating delta between the value of last measurement of the label and each stat:
    stats = [' max', ' min', ' average', ' first']
    delta_str = ', delta bw last,'
    # for calculating the time from the min/max.. measurement for each label to target time (in hours):
    time_feats = [' max', ' min', ' last', ' first']
    time_str = ', hours to target from'

    # Create the frame with all the identifiers and every feature column (NaN-filled) in one
    # step; inserting the columns one by one fragments the frame.
    labels = sorted(table.label.unique()) # all labels in table, sorted
    feature_columns = (
        [label + feat for label in labels for feat in features]
        + [label + delta_str + stat for label in labels for stat in stats]
        + [label + time_str + feat for label in labels for feat in time_feats]
    )
    idents = cohort.sort_values(by=['identifier'])['identifier']
    df = idents.to_frame().reindex(columns=['identifier'] + feature_columns)

    # Split the measurements once instead of scanning the whole table for every patient.
    rows_by_ident = {iden: rows for iden, rows in table.groupby('identifier')}

    per_patient = []  # collected rows; concatenated once at the end
    for done, iden in enumerate(df['identifier'], start=1):
        # Explicit copy: the row is filled in below and must not alias df.
        df_relevant_row = df.loc[df['identifier'] == iden].copy()
        table_relevant_rows = rows_by_ident.get(iden)

        if table_relevant_rows is not None and not table_relevant_rows.empty:
            # sort by charttime for extracting times and first/last values
            table_relevant_rows = table_relevant_rows.sort_values(by=['label', 'charttime'])
            grouped = table_relevant_rows.groupby('label')

            # max / min / average / amount of the score for each label
            stats_df = grouped['score'].agg(['max', 'min', 'mean', 'count'])
            stats_df.columns = agg_features
            for label, row in stats_df.iterrows():
                for feat in agg_features:
                    df_relevant_row[label + feat] = row[feat]

            first_df = grouped.first() # first measurement of each label
            last_df = grouped.last() # last measurement of each label

            # add the first measurement of each label and its time to target:
            for label, row in first_df.iterrows():
                df_relevant_row[label + ' first'] = row['score']
                df_relevant_row[label + time_str + ' first'] = row[hours_col]

            for label, row in last_df.iterrows():
                last_val = row['score']
                df_relevant_row[label + ' last'] = last_val
                # delta between the last measurement and the different stats:
                for stat in stats:
                    df_relevant_row[label + delta_str + stat] = last_val - df_relevant_row[label + stat]
                # time from last measurement to target
                df_relevant_row[label + time_str + ' last'] = row[hours_col]

                # time from the max and min measurements to target time (in hours); when the
                # extreme score occurs more than once, the latest occurrence is used.
                label_rows = table_relevant_rows.loc[table_relevant_rows['label'] == label]
                for t_feat in time_feats[:2]:
                    sc = df_relevant_row[label + t_feat].values[0]
                    matching_hours = label_rows.loc[label_rows['score'] == sc, hours_col]
                    df_relevant_row[label + time_str + t_feat] = matching_hours.values[-1]

        per_patient.append(df_relevant_row)

        if done % 200 == 0:
            print("done {} patients.".format(done))

    df_final = pd.concat(per_patient) if per_patient else pd.DataFrame()

    print("done creating features from ",table_name) 
    print("shape: ",df_final.shape)

    df_final.to_csv(output_file_name, encoding='utf-8', index=False)

    print("done creating {}_for_modeling.csv\n".format(table_name))
    