In [37]:
import numpy as np
import pandas as pd
# Disable pandas' chained-assignment check for the whole session, which
# silences SettingWithCopyWarning when values are assigned into a slice.
pd.options.mode.chained_assignment = None 
# Optional display tweaks (left disabled): lift the column / row limits pandas
# applies when rendering DataFrames.
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [52]:
def _culture_features_for_patient(patient_rows, features, culture_kind, days_limit, remove_target):
    """Compute the feature values for one identifier from its culture rows.

    Parameters
    ----------
    patient_rows : pandas.DataFrame
        The (already de-duplicated) rows of the culture table that belong to a
        single identifier, in table order.
    features : list of str
        Output column names, in the positional order built by
        ``create_culture_features``.
    culture_kind : str
        ``'Other Culture'`` or ``'Blood Culture'``.
    days_limit : str
        Name of the 0/1 column flagging rows taken early enough before target.
    remove_target : int
        Amount subtracted from the counts (1 for blood cultures, 0 otherwise).

    Returns
    -------
    dict
        Maps every name in ``features`` to its value; features that do not
        apply stay 0.
    """
    values = dict.fromkeys(features, 0)
    culture_count = len(patient_rows)  # number of cultures taken from the patient
    if culture_count == 0:
        return values

    is_other = culture_kind == 'Other Culture'
    # For blood cultures the count is reduced by ``remove_target`` (the target
    # itself), so a single row means nothing was taken prior to the target.
    if not is_other and culture_count <= 1:
        return values

    values[features[0]] = 1
    values[features[1]] = culture_count - remove_target

    in_icu = int((patient_rows['in_icu'] == 1).sum())
    # Original author's note: the target was always taken in the ICU.
    values[features[2]] = in_icu - remove_target
    if culture_count - in_icu > 0:  # there's at least one entry with in_icu=0
        values[features[3]] = 1

    before_limit = patient_rows[days_limit] == 1
    positive_before = patient_rows.loc[(patient_rows['result'] == 1) & before_limit]
    if len(positive_before) > 0:
        # A positive row before the limit implies the denominator is >= 1.
        values[features[4]] = len(positive_before) / int(before_limit.sum())
        values[features[5]] = 1

    if is_other:
        hours_col = 'hours_from_charttime_to_targettime'
        # ``unique()`` (not ``nunique()``) so a missing spec_type still counts as a kind.
        values[features[6]] = len(patient_rows['spec_type'].unique())
        values[features[7]] = len(positive_before['spec_type'].unique())
        # NOTE(review): "first"/"last" are simply the first and last rows in
        # table order -- presumably the table is sorted by charttime; confirm
        # against the caller.
        values[features[8]] = patient_rows[hours_col].iloc[0]
        values[features[9]] = patient_rows[hours_col].iloc[-1]

    return values


def create_culture_features(cohort, table, output_file_name, culture_kind, model, table_name):
    """Build per-identifier culture features and write them to a CSV file.

    One output row is produced per row of ``cohort`` (sorted by identifier).
    Identifiers with no rows in ``table`` get 0 for every feature.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must contain an ``identifier`` column.
    table : pandas.DataFrame
        Culture rows. Columns read: ``identifier``, ``charttime``, ``in_icu``,
        ``result``, the days-limit column selected by ``model`` and, for
        ``'Other Culture'``, ``spec_type`` and
        ``hours_from_charttime_to_targettime``.
    output_file_name : str
        Path of the CSV file to write (UTF-8, no index column).
    culture_kind : str
        ``'Other Culture'`` or ``'Blood Culture'``. It is interpolated into the
        output column names and selects which features are computed.
    model : str
        ``'a'`` uses ``at_least_3_days_prior_targettime``; ``'b'`` uses
        ``at_least_2_days_prior_targettime``.
    table_name : str
        Used only in the progress messages.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``model`` or ``culture_kind`` is not one of the supported values.
    """
    days_limit_by_model = {
        'a': 'at_least_3_days_prior_targettime',
        'b': 'at_least_2_days_prior_targettime',
    }
    # Fail fast with a clear message instead of a NameError deep in the loop.
    if model not in days_limit_by_model:
        raise ValueError(
            "model must be one of {}, got {!r}".format(sorted(days_limit_by_model), model))
    if culture_kind not in ('Other Culture', 'Blood Culture'):
        raise ValueError(
            "culture_kind must be 'Other Culture' or 'Blood Culture', got {!r}".format(culture_kind))
    days_limit = days_limit_by_model[model]

    print("started creating features from ",table_name)
    cohort = cohort.sort_values(by=['identifier'])

    # NOTE(review): the label of feature #4 says "3 days" even for model 'b'
    # (2-day column). Kept unchanged so the output column names stay stable.
    features = ["At least one {} was taken during hospital stay prior to target".format(culture_kind), #0
                "Number of {}s taken during hospital stay prior to target".format(culture_kind), #1
                "Number of {}s taken in the ICU prior to target".format(culture_kind), #2
                "At least one {} was taken before ICU intime".format(culture_kind), #3
                "Positive/total {}s taken at least 3 days prior to target".format(culture_kind), #4
                "There was a positive {} during hospital stay prior to target".format(culture_kind), #5
               ]
    if culture_kind == 'Other Culture':
        features += ["Number of kinds of {}s taken during hospital stay prior to target".format(culture_kind), #6
                     "Number of kinds of {}s with at least one positive sample prior to target".format(culture_kind), #7
                     "Hours from when first {} was taken to target time".format(culture_kind), #8
                     "Hours from when last {} was taken to target time".format(culture_kind) #9
                    ]
        remove_target = 0
        # remove rows of the same culture (only org_name is different and that's why the same row repeats):
        dedup_subset = ['identifier', 'charttime', 'spec_type', 'result']
    else:
        remove_target = 1 # for calculating the number of cultures without the target itself
        dedup_subset = ['identifier', 'charttime', 'in_icu', 'result']
    table = table.drop_duplicates(subset=dedup_subset)

    # Split the table once per identifier instead of re-filtering it for every
    # cohort row; ``sort=False`` keeps groups and their rows in table order.
    rows_by_identifier = {key: rows for key, rows in table.groupby('identifier', sort=False)}

    # Collect plain dicts and build the frame once (no concat-in-a-loop, no
    # assignment into DataFrame slices).
    feature_rows = []
    for iden in cohort['identifier']:
        patient_rows = rows_by_identifier.get(iden)
        if patient_rows is None:  # identifier has no cultures in the table
            feature_rows.append(dict.fromkeys(features, 0))
        else:
            feature_rows.append(_culture_features_for_patient(
                patient_rows, features, culture_kind, days_limit, remove_target))

    # ``columns=`` fixes the column order and yields a header even for an empty cohort.
    df_final = pd.DataFrame(feature_rows, columns=features)
    df_final.insert(0, 'identifier', cohort['identifier'].values)

    print("done creating features from ",table_name) 
    print("shape: ",df_final.shape)
    df_final.to_csv(output_file_name, encoding='utf-8', index=False)
    print("done creating {}_for_modeling.csv\n".format(table_name))
    
    
    