### Functions to clean up the data after splitting out the training set.

#### They are collected as I go through the EDA

In [10]:
import pandas as pd
import numpy as np

**Collect list of existing interesting features**

In [11]:
# Existing columns flagged as interesting so far.
features = [
    'hospital_id',
    'age',
    'bmi',
    'ethnicity',
    'elective_surgery',
    'icu_admit_source',
    'apache_3j_bodysystem',
    'apache_3j_diagnosis',
    'apache_2_diagnosis',
    'h1_temp_min',
]

**Features to drop**

In [12]:
# Columns slated to be dropped entirely.
feat_drop = [
    'encounter_id',
    'hospital_admit_source',
    'height',
    'weight',
    'apache_2_bodysystem',
]

**Features to keep, but drop rows**

In [13]:
def dropFeatRows(df):
    """Return a copy of ``df`` without the rows that lack a required value.

    A row is removed when ``icu_admit_source`` or ``gender`` is missing.
    The input frame itself is left untouched.
    """
    # Apache 2 diagnosis, 3j diagnosis and bodysystem all share the same
    # missing rows; dropping on 'apache_3j_bodysystem' is currently disabled.
    required_columns = ['icu_admit_source', 'gender']
    return df.dropna(subset=required_columns)

**Features to keep, fillna**

In [14]:
def fillNA(df):
    """Impute missing ``age``, ``bmi`` and ``ethnicity`` values.

    * ``age``       -> mean of the column
    * ``bmi``       -> median of the column
    * ``ethnicity`` -> the constant ``'Caucasian'`` (noted by the author as
      the most common value)

    The columns are overwritten on ``df`` itself, so the caller's frame is
    modified; the same frame is also returned.
    """
    imputers = (
        ('age', lambda values: values.mean()),
        ('bmi', lambda values: values.median()),
        ('ethnicity', lambda values: 'Caucasian'),
    )
    for column, choose_fill in imputers:
        current = df[column]
        df[column] = current.fillna(choose_fill(current))

    return df

### Feature Engineering

In [15]:
# Get columns with 'h1' and create identifier column
def makeh1(df):
    """Add a ``has_h1`` indicator column and return the new frame.

    ``has_h1`` is 1 for rows where at least one column whose name contains
    ``'h1'`` holds a non-missing value, and 0 otherwise. This mirrors how
    ``lactate_ind`` is built from ``notna`` in ``getLactate``.

    The previous version tested ``isna`` instead, so the flag was raised when
    an h1 value was *absent*, the opposite of what the column name says.

    The input frame is not modified. An existing ``has_h1`` column is ignored
    when selecting the h1 columns, so applying the function twice gives the
    same result.
    """
    h1_cols = [name for name in df.columns if 'h1' in name and name != 'has_h1']
    has_h1 = df[h1_cols].notna().any(axis=1).astype(int)

    # Assign positionally (.values) so the result does not depend on index alignment.
    return df.assign(has_h1=has_h1.values)

### Collapse apache, d1 and h1 measurements

In [16]:
def getLactate(df, fill_value=1):
    """Collapse the lactate measurements into ``lactate_ind`` and ``lactate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose lactate source columns have ``'lactate'`` in their name.
    fill_value : scalar, default 1
        Value written to ``lactate`` for rows without any lactate measurement.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with two columns added or
        overwritten:

        * ``lactate_ind`` - 1 if any source column is non-missing, else 0
        * ``lactate``     - row-wise minimum of the source columns, with
          ``fill_value`` where every source column is missing
    """
    derived = ('lactate', 'lactate_ind')

    # The derived columns also contain 'lactate' in their name. Leaving them
    # in the selection would, on a second run, force the indicator to 1 and
    # mix the 0/1 indicator into the minimum.
    source_cols = [
        name for name in df.columns
        if 'lactate' in name and name not in derived
    ]
    readings = df[source_cols]

    lactate_ind = readings.notna().any(axis=1).astype(int)
    lactate = readings.min(axis=1).fillna(fill_value)

    return df.assign(lactate_ind=lactate_ind.values, lactate=lactate.values)
    

In [17]:
def getBun(df):
    """Add a ``bun`` column summarising every column whose name contains 'bun'.

    Each row gets the median of its matching columns. Rows where all of them
    are missing receive the median of those per-row medians instead.

    A new frame is returned and the source columns are kept. No indicator
    column is created here (the placeholder for one was left empty).
    """
    bun_cols = [name for name in df.columns if 'bun' in name]

    row_median = df[bun_cols].median(axis=1)
    overall_median = row_median.median()

    return df.assign(bun=row_median.fillna(overall_median))

In [18]:
def getBilirubin(df):
    """Add a ``bilirubin`` column built from all columns containing 'bilirubin'.

    The value is the row-wise median of the matching columns. Where a row has
    no reading in any of them, the median of the per-row medians is used.

    Returns a new frame; the source columns are not dropped.
    """
    matching = [label for label in df.columns if 'bilirubin' in label]
    per_row = df[matching].median(axis=1)

    # Fallback for rows without a single bilirubin reading.
    fallback = per_row.median()
    collapsed = per_row.where(per_row.notna(), fallback)

    return df.assign(bilirubin=collapsed)

In [19]:
def _collapse_by_median(df, key):
    """Replace every column whose name contains ``key`` by one column ``key``.

    The new column holds the row-wise median of the matching columns; rows
    with no value at all get the median of those per-row medians. The raw
    source columns are dropped, but a column named exactly ``key`` is kept.
    """
    matched = [name for name in df.columns if key in name]
    row_median = df[matched].median(axis=1)
    collapsed = row_median.fillna(row_median.median())

    # Never drop the aggregate itself. If a column called `key` already
    # exists (e.g. the function is applied a second time), dropping every
    # match would remove the value that was just computed.
    sources = [name for name in matched if name != key]
    return df.drop(columns=sources).assign(**{key: collapsed})


def getACR(df):
    """Collapse albumin and creatinine columns and add their ratio ``ACR``.

    Steps:

    1. All columns containing 'albumin' are replaced by a single ``albumin``
       column (row-wise median, missing rows filled with the overall median).
    2. The same is done for 'creatinine'.
    3. ``ACR`` is set to ``albumin / creatinine``.

    A new frame is returned; the input is not modified. Applying the function
    to its own output leaves the three columns unchanged, where it previously
    failed because the aggregate columns had been dropped.

    NOTE(review): a creatinine value of 0 yields inf/NaN in ``ACR``; nothing
    here guards against it.
    """
    df = _collapse_by_median(df, 'albumin')
    df = _collapse_by_median(df, 'creatinine')

    return df.assign(ACR=df['albumin'] / df['creatinine'])

**Summarize**

In [None]:
def transform_data(df):
    """Apply the cleaning and feature-engineering steps to ``df`` in order.

    Pipeline: ``dropFeatRows`` -> ``getBilirubin`` -> ``getBun`` ->
    ``getLactate`` -> ``makeh1`` -> ``getACR``. The transformed frame is
    returned.

    ``dropFeatRows`` used to be called a second time after ``makeh1``. The
    steps in between only add columns, so that second pass could not remove
    any row and has been dropped.

    NOTE(review): ``fillNA`` is defined in this notebook but is not part of
    this pipeline - confirm whether it should be applied here.
    """
    steps = (
        dropFeatRows,
        getBilirubin,
        getBun,
        getLactate,
        makeh1,
        getACR,
    )
    for step in steps:
        df = step(df)

    return df