In [54]:
import numpy as np
import pandas as pd
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session; this is a global setting and affects every DataFrame.
pd.options.mode.chained_assignment = None 
# Debug aid: uncomment to make pandas print every row of a DataFrame.
#pd.set_option('display.max_rows', None)

In [55]:
def create_line_features(cohort, table, output_file_name, table_name):
    """Build per-identifier line features from ``table`` and write them to CSV.

    The output has one row per row of ``cohort`` (ordered by identifier) and,
    for every distinct value of ``table['label']`` in order of first
    appearance, these four columns:

    * ``"<label>, amount during hospital stay"`` -- number of ``table`` rows
      the identifier has for that label.
    * ``"<label>, hours from first insertion to target"`` -- the
      ``hours_from_starttime_to_targettime`` value of the identifier's first
      ``table`` row for that label (0 when it has none).
    * ``"<label> was inserted in the last day before target time"`` -- 1 when
      the identifier's last ``table`` row for that label has an hours value
      in ``[0, 24]``, otherwise 0.
    * ``"patient had <label> during hospital stay"`` -- 1 when the identifier
      has at least one row for that label, otherwise 0.

    A final column, ``"number of invasive lines inserted during hospital
    stay"``, holds the sum of the per-label amounts.

    NOTE(review): "first" and "last" are positional (row order inside
    ``table``); this presumably relies on ``table`` being ordered
    chronologically per identifier -- confirm with the caller.

    Args:
        cohort: DataFrame with an ``identifier`` column. Rows whose
            identifier is missing produce no output row.
        table: DataFrame with ``identifier``, ``label`` and
            ``hours_from_starttime_to_targettime`` columns. Identifiers that
            do not occur in ``cohort`` are ignored.
        output_file_name: Destination path of the CSV file.
        table_name: Name used only in the progress messages.

    Returns:
        None. The features are written to ``output_file_name``.
    """
    hours_col = 'hours_from_starttime_to_targettime'
    last_day_hours = 24.0
    no_line = (0, 0, None)  # (count, first hours, last hours) when absent

    print("started creating features from ",table_name)

    # One output row per cohort row, ordered by identifier. An identifier that
    # is missing can never match a table row, so it is dropped up front.
    identifiers = (
        cohort['identifier'].dropna().sort_values().reset_index(drop=True)
    )
    labels = table['label'].unique()

    # Single pass over the table instead of re-filtering it for every
    # identifier. groupby keeps the original row order inside each group, so
    # iloc[0] / iloc[-1] are the first / last rows as they appear in `table`.
    stats_by_identifier = {}
    groups = table.groupby(['identifier', 'label'], sort=False)[hours_col]
    for (iden, label), hours in groups:
        stats_by_identifier.setdefault(iden, {})[label] = (
            len(hours), hours.iloc[0], hours.iloc[-1]
        )

    per_row_stats = [stats_by_identifier.get(iden, {}) for iden in identifiers]

    # Collect whole columns and build the frame once; concatenating one row at
    # a time is quadratic and inserting columns one by one fragments the frame.
    columns = {'identifier': identifiers}
    totals = [0] * len(per_row_stats)
    for label in labels:
        amounts = []
        first_hours = []
        in_last_day = []
        had_line = []
        for row, stats in enumerate(per_row_stats):
            count, first, last = stats.get(label, no_line)
            amounts.append(count)
            first_hours.append(first)
            # Only an existing line can fall inside the last-day window.
            in_last_day.append(int(count > 0 and 0 <= last <= last_day_hours))
            had_line.append(int(count > 0))
            totals[row] += count
        columns["{}, amount during hospital stay".format(label)] = amounts
        columns["{}, hours from first insertion to target".format(label)] = first_hours
        columns["{} was inserted in the last day before target time".format(label)] = in_last_day
        columns["patient had {} during hospital stay".format(label)] = had_line
    columns["number of invasive lines inserted during hospital stay"] = totals

    df_final = pd.DataFrame(columns)

    print("done creating features from ",table_name)
    print("shape: ",df_final.shape)
    df_final.to_csv(output_file_name, encoding='utf-8', index=False)
    print("done creating {}_for_modeling.csv\n".format(table_name))
    