In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import functools
from collections import defaultdict

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Handle on this module's namespace; the read_* helpers store each loaded
# table in it under the table's name.
_g = globals()

# Display limits for rendered DataFrames.
pd.options.display.max_rows = 200000
pd.options.display.max_columns = 500

# Every run writes its artefacts to a fresh, minute-stamped results folder.
now = dt.datetime.now().strftime('%Y-%m-%d-%H-%M')
result_dir = f'results/{now}/'
print(now)
os.makedirs(result_dir, exist_ok=True)

2019-12-01-09-32


In [2]:
def printer(x):
    """Print ``x`` and then a dashed rule on the following line."""
    rule = '-------------------'
    print(f'{x}\n{rule}')

**Read in Data**
---

In [3]:
# When False, the loading cell below reads every CSV found in demo-data/.
prod = False
# Table names for the full dataset, in the form read_prod_data() expects.
# NOTE(review): nothing visible in this notebook iterates over this list — confirm where it is used.
prod_data = ['d_items', 'chartevents', 'admissions', 'prescriptions', 
             'diagnoses_icd', 'd_icd_diagnoses', 'patients', 'icustays', 'cptevents']

In [19]:
def read_dev_data(fname, datadir='demo-data/'):
    """Load one demo CSV as an all-string DataFrame and publish it as a global.

    Parameters
    ----------
    fname : str
        File name inside ``datadir`` (e.g. ``'admissions.csv'``).
    datadir : str, optional
        Directory prefix (including the trailing slash) the file is read from.

    Side effects
    ------------
    Prints the table name and shape, and stores the frame in the notebook
    namespace under the table name.
    """
    # Everything before the first '.csv' is the table name. Unlike slicing off
    # the last four characters, this also yields a usable name for files such
    # as 'admissions.csv.gz', which the caller's "'.csv' in name" filter accepts.
    table_name = fname.split('.csv')[0]
    data = pd.read_csv(f'{datadir}{fname}', dtype=str, encoding='latin1')
    print(table_name, data.shape)
    _g[table_name] = data
    
def read_prod_data(table_name):
    """Load the gzipped production CSV for ``table_name`` and publish it as a global.

    The file is expected at ``data/<TABLE_NAME>.csv.gz``. All columns are read
    as strings and the column names are lower-cased before the frame is stored
    in the notebook namespace under ``table_name``.
    """
    source_path = 'data/' + (table_name.upper() + '.csv.gz')
    frame = pd.read_csv(source_path, dtype=str, encoding='latin1',
                        compression='gzip')
    frame.columns = [column.lower() for column in frame.columns]
    print(table_name, frame.shape)
    _g[table_name] = frame
    
def read_charts_data(bin_id):
    """Replace the global ``chartevents`` table with one pre-split bin of it.

    ``bin_id`` selects the file ``split-data/chartevents/bin_<bin_id>.csv``;
    every column is read as a string.
    """
    bin_path = f'split-data/chartevents/bin_{bin_id}.csv'
    chart_bin = pd.read_csv(bin_path, dtype=str)
    _g['chartevents'] = chart_bin
    
# Every two-digit lowercase hex string, '00' through 'ff', in ascending order.
bins = [f'{value:02x}' for value in range(256)]

In [20]:
# Development mode: load every CSV in demo-data/ into a global named after the file.
# NOTE(review): there is no branch for prod=True here, so no tables are loaded in that mode — confirm.
if not prod:
    datadir = 'demo-data/'
    for fname in [x for x in os.listdir(datadir) if '.csv' in x]:    
        # skip files whose name contains 'icloud'
        if 'icloud' in fname:
            continue
        read_dev_data(fname)

# also read in the crosswalk to make the label
icd2hccxw2014 = pd.read_csv('code_descriptions/icd2hccxw2014.csv', dtype='str')

procedureevents_mv (753, 25)
callout (77, 24)
d_cpt (134, 9)
d_items (12487, 10)
caregivers (7567, 4)
microbiologyevents (2003, 16)
labevents (76074, 9)
inputevents_cv (34799, 22)
admissions (129, 19)
d_labitems (753, 6)
datetimeevents (15551, 14)
prescriptions (10398, 19)
procedures_icd (506, 5)
noteevents (0, 11)
chartevents (758355, 15)
transfers (524, 13)
diagnoses_icd (1761, 5)
services (163, 6)
drgcodes (297, 8)
outputevents (11320, 13)
patients (100, 8)
d_icd_diagnoses (14567, 4)
icustays (136, 12)
inputevents_mv (13224, 31)
d_icd_procedures (3882, 4)
cptevents (1579, 12)


**Make Labeled DataFrame**
---

In [13]:
def add_aki_hcc_label(diagnoses, icdxw, d_icd_diagnoses, admissions):
    """Build the labeled base table: one row per admission with the HCC 135 flag.

    Parameters
    ----------
    diagnoses : DataFrame
        Diagnosis rows; read here: ``hadm_id``, ``subject_id``, ``icd9_code``.
    icdxw : DataFrame
        ICD-to-HCC crosswalk; read here: ``icd`` (join key) and ``hcc``.
    d_icd_diagnoses : DataFrame
        Diagnosis dictionary joined on ``icd9_code`` to obtain ``short_title``.
    admissions : DataFrame
        Supplies ``admittime`` and ``dischtime`` per (``hadm_id``, ``subject_id``).

    Returns
    -------
    DataFrame
        ``hadm_id``, ``subject_id``, the HCC dummy column(s) whose name contains
        ``'_135'``, and ``admittime`` / ``dischtime`` converted to datetimes.

    Side effects
    ------------
    Writes ``unmerged_diagnosis_to_hcc.csv`` and ``most_coded_hccs.csv`` into
    the global ``result_dir`` and prints the shape of the statistics table.
    """
    # validate='m:1' makes the merge fail if the crosswalk lists a code twice
    merged = pd.merge(diagnoses, icdxw, how='left', 
                      left_on='icd9_code', right_on='icd', 
                      indicator='_hcc_merge', validate='m:1')
    
    # pull out the diagnoses codes that don't map to an hcc (may be useful)
    not_merged = merged.loc[merged._hcc_merge == 'left_only']
    not_merged = not_merged.merge(d_icd_diagnoses, how='left', on='icd9_code')
    not_merged.short_title.value_counts().to_csv(result_dir + 
                                                 'unmerged_diagnosis_to_hcc.csv', 
                                                 header=True)
            
    # one-hot encode the hcc codes, then collapse to one row per admission
    # (max => 1 if the admission carries the hcc on any diagnosis row)
    merged = pd.concat([merged, 
                        pd.get_dummies(merged.hcc, prefix='hcc_cd')], axis=1)
    cols = [x for x in merged if 'hcc_cd' in x]
    data = merged.groupby(['hadm_id', 'subject_id'], as_index=False)[cols].max()
    
    def statistics_on_hcc_labels(df):
        """Save describe() statistics plus a per-column 'sum', sorted by mean (descending)."""
        data = pd.concat([df.describe().T, 
               df.drop(['hadm_id', 'subject_id'], axis=1).sum()], axis=1)\
                .rename(columns={0:'sum'})\
                .sort_values('mean', ascending=False)
        print(data.shape)
        data.to_csv(result_dir + 'most_coded_hccs.csv')
    
    statistics_on_hcc_labels(data)
    
    # keep only the hcc column(s) whose name contains '_135' as the label
    drop_cols = [x for x in cols if '_135' not in x]
    data = data.drop(drop_cols, axis=1)
    
    # merge in admissions
    data = data.merge(admissions[['hadm_id', 'subject_id', 
                                  'admittime', 'dischtime']], 
                      how='left', on=['hadm_id', 'subject_id'])
    
    # transform datatypes: every column with 'time' in its name becomes a datetime
    for i in [x for x in data if 'time' in x]:
        data[i] = pd.to_datetime(data[i])
    
    return data
    

# Build the labeled base table, then show a random sample, its shape and the
# share of admissions with / without the hcc_cd_135 label.
df = add_aki_hcc_label(diagnoses_icd, icd2hccxw2014, 
                       d_icd_diagnoses, admissions)
df.sample(5)
df.shape
df.hcc_cd_135.value_counts(normalize=True)

(67, 9)


Unnamed: 0,hadm_id,subject_id,hcc_cd_135,admittime,dischtime
89,170883,10124,1,2192-04-16 20:57:00,2192-05-15 19:28:00
20,118192,41795,0,2145-09-06 08:52:00,2145-09-09 16:25:00
19,117105,42135,1,2127-10-06 21:00:00,2127-10-28 12:50:00
88,170119,10074,1,2167-02-11 22:10:00,2167-02-19 15:30:00
76,165436,10119,0,2117-08-21 06:58:00,2117-08-26 13:00:00


(129, 5)

0    0.604651
1    0.395349
Name: hcc_cd_135, dtype: float64

**Charts Data**
---

In [14]:
def make_clean_charts_data(chartevents, d_items, label):
    """Join chart events to the item dictionary and the admission label, then tidy.

    Parameters
    ----------
    chartevents : DataFrame
        Raw chart events (all-string columns as loaded from CSV).
    d_items : DataFrame
        Item dictionary joined on ``itemid`` (supplies ``label``, ``category``, ...).
    label : DataFrame
        Per-admission label table joined on ``subject_id`` and ``hadm_id``.

    Returns
    -------
    DataFrame
        One row per chart event with datetime ``charttime`` / ``storetime``,
        a derived ``eventtime`` (the earlier of the two), float ``valuenum``,
        a normalised lower-case ``category`` and the columns of ``label``.
    """
    # merge in item metadata for every charted event
    charts = chartevents.merge(d_items, how='left', on='itemid', indicator='_d_items')
    # a bare expression inside a function is never displayed, so print explicitly
    print(f'charts data shape: {charts.shape}')

    if (charts._d_items != 'both').any():
        print('merge statistics')
        print(charts._d_items.value_counts())

    # convert time fields to datetime
    for col in ['charttime', 'storetime']:
        charts[col] = pd.to_datetime(charts[col])

    # use the earliest time between events that are recorded directly in the chart
    # and events that are manually stored
    charts['eventtime'] = charts[['charttime', 'storetime']].min(axis=1)

    # change the valuenum field to numeric in case we need it
    charts['valuenum'] = charts['valuenum'].astype(float)

    # drop unnecessary columns
    drop_cols = ['conceptid', 'param_type', '_d_items', 'valueuom', 'warning',
                 'error', 'resultstatus', 'stopped', 'row_id_x', 'row_id_y',
                 'linksto']
    charts = charts.drop(drop_cols, axis=1)

    # Normalise the category to a lower-case, underscore-separated token to
    # avoid issues with spelling and capitalization. Every replacement is a
    # literal one (regex=False): the default for `regex` differs between pandas
    # versions, and with literal matching the old escaped patterns (\'s and \/)
    # would silently stop matching.
    literal_replacements = [
        ('-', ''),
        ('  ', ''),
        (' ', '_'),
        ("'s", ''),
        ('/', '_'),
        ('(', ''),
        (')', ''),
    ]
    category = charts['category'].str.lower()
    for old, new in literal_replacements:
        category = category.str.replace(old, new, regex=False)
    charts['category'] = category

    # merge in label
    charts = charts.merge(label, how='left', on=['subject_id', 'hadm_id'])
    return charts

# Clean the chart events, attach the label, and preview the result.
charts = make_clean_charts_data(chartevents, d_items, df)
charts.shape
charts.head()

(758355, 18)

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,storetime,cgid,value,valuenum,label,abbreviation,dbsource,category,unitname,eventtime,hcc_cd_135,admittime,dischtime
0,40124,126179,279554,223761,2130-02-04 04:00:00,2130-02-04 04:35:00,19085,95.9,95.9,Temperature Fahrenheit,Temperature F,metavision,routine_vital_signs,?F,2130-02-04 04:00:00,1,2130-02-04 02:26:00,2130-02-10 17:39:00
1,40124,126179,279554,224695,2130-02-04 04:25:00,2130-02-04 05:55:00,18999,2222221.7,2222221.7,Peak Insp. Pressure,Peak Insp. Pressure,metavision,respiratory,cmH2O,2130-02-04 04:25:00,1,2130-02-04 02:26:00,2130-02-10 17:39:00
2,40124,126179,279554,220210,2130-02-04 04:30:00,2130-02-04 04:43:00,21452,15.0,15.0,Respiratory Rate,RR,metavision,respiratory,insp/min,2130-02-04 04:30:00,1,2130-02-04 02:26:00,2130-02-10 17:39:00
3,40124,126179,279554,220045,2130-02-04 04:32:00,2130-02-04 04:43:00,21452,94.0,94.0,Heart Rate,HR,metavision,routine_vital_signs,bpm,2130-02-04 04:32:00,1,2130-02-04 02:26:00,2130-02-10 17:39:00
4,40124,126179,279554,220179,2130-02-04 04:32:00,2130-02-04 04:43:00,21452,163.0,163.0,Non Invasive Blood Pressure systolic,NBPs,metavision,routine_vital_signs,mmHg,2130-02-04 04:32:00,1,2130-02-04 02:26:00,2130-02-10 17:39:00


**Feature Investigation List**
---
- chronic liver disease
- sepsis
- mechanical ventilation
- anemia
- potassium
- sodium

**Done**
- medical imaging that uses contrast dyes
- prescriptions which can be nephrotoxins
- low blood ph
- hypertension
- hematocrit
- gender
- age
- ethnicity
- creatinine increases
- urine color
- urine appearance
- prior admission in 30, 60, 90, 120 days
- prior micu icustay in 30 days
- prior ccu icustay in 30 days

**Feature Discovery**
---

In [None]:
def create_contrast_imaging_feature(cptevents):
    """Flag admissions that had a radiology procedure using contrast dye.

    Parameters
    ----------
    cptevents : DataFrame
        CPT billing events; read here: ``hadm_id`` and ``cpt_cd`` (strings).

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` that has at least one CPT code starting with
        70-75, with ``ft_contrast_imaging`` = 1 if any of those codes is in the
        contrast list and 0 otherwise. Admissions without such codes are absent.
    """
    # Keep only codes whose first two characters are 70..75. Slicing + isin
    # treats a missing cpt_cd as "no match" instead of poisoning the mask.
    radiology_prefixes = ['70', '71', '72', '73', '74', '75']
    is_radiology = cptevents.cpt_cd.str[:2].isin(radiology_prefixes)
    # copy so that adding the feature column does not write into a slice of the input
    res = cptevents.loc[is_radiology].copy()

    # codes with contrast from 2019 (not the year the data was recorded in)
    radiology_cpt_codes = [
        '74177',
        '74160',
        '71260',
        '73701',
        '73201',
        '70460',
        '70487',
        '70491',
        '70481',
        '72193',
        '72126',
        '72132',
        '72129',
        '75574',
        '75572',
        '70545',
        '70548'
    ]

    res['ft_contrast_imaging'] = res.cpt_cd.isin(radiology_cpt_codes)*1
    return res.groupby('hadm_id', as_index=False)['ft_contrast_imaging'].max()

In [None]:
def add_nephrotoxin_features(prescriptions, admissions):
    """Flag prescriptions of potentially nephrotoxic drugs per admission.

    Parameters
    ----------
    prescriptions : DataFrame
        Read here: ``hadm_id``, ``subject_id``, ``drug``, ``startdate``.
    admissions : DataFrame
        Joined on ``hadm_id`` / ``subject_id`` to supply ``admittime``.

    Returns
    -------
    DataFrame
        One row per ``hadm_id``. For "any drug", every individual drug and every
        drug group there is a 0/1 flag (``..._rx``) plus flags restricted to
        prescriptions starting less than 24, 48 and 72 hours after admission
        (``..._rx_within_<hours>``).
    """
    res = prescriptions.merge(admissions, how='left', on=['hadm_id', 'subject_id'])
    meds_list = {}
    # NOTE(review): 'aminoglycoloside' looks like a misspelling of
    # 'aminoglycoside'; it is left as is because it is also part of the
    # generated feature name — confirm before renaming.
    meds_list['antibiotics'] = ['bacitracin',
                                'vancomycin',
                                'amphotericin',
                                'cephalosporin',
                                'aminoglycoloside',
                                'ciprofloxacin']
    meds_list['blood_pressure'] = ['lisinopril',
                                   'ramipril',
                                   'metoprolol',
                                   'candesartan',
                                   'valsartan',
                                   'warfarin']
    meds_list['edema'] = ['furosemide']
    meds_list['nsaid'] = ['ibuprofen', 'naproxen', 'ketoprofen']
    meds_list['ulcer'] = ['cimetidine']
    meds_list['other'] = ['propofol']

    res['time_delta'] = pd.to_datetime(res.startdate) - pd.to_datetime(res.admittime)

    # loop-invariant pieces, computed once
    windows = [24, 48, 72]
    within = {hr: res.time_delta < pd.Timedelta(hours=hr) for hr in windows}
    drug_name = res.drug.str.lower()
    med_masks = {med: drug_name.str.contains(med, na=False)
                 for meds in meds_list.values() for med in meds}

    def any_of(meds):
        """True where the prescription matches at least one of ``meds``."""
        combined = pd.Series(False, index=res.index)  # aligned with res by construction
        for med in meds:
            combined = combined | med_masks[med]
        return combined

    # Collect all new columns first and attach them in one concat; inserting
    # ~100 columns one at a time fragments the frame.
    new_cols = {}

    def add_flags(name, mask):
        """Register the overall flag and its per-window variants."""
        new_cols[name] = mask*1
        for hr in windows:
            new_cols[f'{name}_within_{hr}'] = (mask & within[hr])*1

    # all drugs
    add_flags('ft_any_nephrotoxin_rx', any_of(list(med_masks)))

    # individual drugs, followed by "any drug in the group"
    for group, drugs in meds_list.items():
        for med in drugs:
            add_flags(f'ft_nephrotoxin_{med}_rx', med_masks[med])
        add_flags(f'ft_nephrotoxin_{group}_rx', any_of(drugs))

    res = pd.concat([res, pd.DataFrame(new_cols, index=res.index)], axis=1)

    features = [x for x in res if 'ft_' in x]
    return res.groupby('hadm_id', as_index=False)[features].max()

In [None]:
def create_blood_ph_features(charts, low_ph_threshold=7.30):
    """Flag admissions with a low pH reading, overall and soon after admission.

    Parameters
    ----------
    charts : DataFrame
        Cleaned chart events; read here: ``hadm_id``, ``eventtime``,
        ``admittime``, ``label``, ``value``, ``valuenum``, ``unitname``.
    low_ph_threshold : float, optional
        Readings strictly below this value count as low.

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` with ``ft_low_blood_ph`` and
        ``ft_low_blood_ph_within_<6|12|24|36|48>_hrs`` (0/1; the window is
        measured from ``admittime`` to ``eventtime``).
    """
    # NOTE(review): any label containing 'pH' or 'PH' is selected, which may
    # also pick up non-blood measurements — confirm against d_items.
    # na=False: events without a label are simply not selected.
    is_ph = (charts.label.str.contains('pH', na=False)
             | charts.label.str.contains('PH', na=False))
    # copy so that adding columns does not write into a slice of `charts`
    res = charts.loc[is_ph,
                     ['hadm_id', 'eventtime', 'admittime', 'label',
                      'value', 'valuenum', 'unitname']].copy()
    res['label'] = 'blood_ph'

    lowbloodph = res.valuenum.lt(low_ph_threshold)
    res['time_delta'] = res.eventtime - res.admittime

    res['ft_low_blood_ph'] = lowbloodph*1
    for hours in [6, 12, 24, 36, 48]:
        within_window = res.time_delta < pd.Timedelta(hours=hours)
        res[f'ft_low_blood_ph_within_{hours}_hrs'] = (lowbloodph & within_window)*1

    features = [x for x in res if 'ft_' in x]
    return res.groupby('hadm_id', as_index=False)[features].max()

In [None]:
def create_prior_admissions(admissions, icustays):
    """Describe each admission's recent *earlier* admissions of the same patient.

    Parameters
    ----------
    admissions : DataFrame
        Read here: ``subject_id``, ``hadm_id``, ``admittime``, ``dischtime``.
    icustays : DataFrame
        Read here: ``hadm_id``, ``last_careunit``, ``los``.

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` of patients with more than one admission:

        * ``ft_prior_admission_<30|60|90|120>`` - 1 if an earlier admission was
          discharged fewer than that many days before this admission started;
        * ``ft_avg_icu_los_within_30`` - mean ICU length of stay over the ICU
          stays of earlier admissions in the 30-day window (NaN if none);
        * ``ft_micu_within_30`` / ``ft_ccu_within_30`` - 1 if such an earlier
          admission had an ICU stay whose last care unit was MICU / CCU.

        Admissions of patients with a single admission are not in the result.
    """
    # self merge: pair every admission ("first") with every admission of the
    # same patient ("second")
    res = admissions.merge(admissions, how='left', on=['subject_id'],
                           suffixes=('_first', '_second'))

    # change the datatypes
    times = [x for x in res if 'time' in x]
    for i in times:
        res[i] = pd.to_datetime(res[i])

    # remove comparison with self; the feature row is keyed by the "first" admission
    res = res.loc[res.hadm_id_first != res.hadm_id_second]
    res = res.rename(columns={'hadm_id_first': 'hadm_id'})

    # add icu data of the *other* admission
    res = res.merge(icustays, how='left', left_on='hadm_id_second', right_on='hadm_id',
                    suffixes=('', '_icu'))

    # The other admission is "prior" only if it started before this one, and
    # the gap runs from ITS discharge to THIS admission. Measuring the other
    # way round (other admit - this discharge) is negative for every earlier
    # admission, however old, and also lets later readmissions count.
    is_prior = res.admittime_second < res.admittime_first
    gap_days = (res.admittime_first - res.dischtime_second).dt.days

    for days in [30, 60, 90, 120]:
        res[f'ft_prior_admission_{days}'] = (is_prior & gap_days.lt(days))*1

    prior_admission_30 = is_prior & gap_days.lt(30)
    # Compare the care unit directly instead of relying on a dummy column that
    # only exists when that unit occurs in the data.
    care_unit = res.last_careunit.str.lower()

    res['ft_avg_icu_los_within_30'] = np.where(prior_admission_30, res.los.astype(float), np.nan)
    res['ft_micu_within_30'] = (care_unit.eq('micu') & prior_admission_30) * 1
    res['ft_ccu_within_30'] = (care_unit.eq('ccu') & prior_admission_30) * 1

    features = [x for x in res if 'ft_' in x]
    how_to_aggregate = {name: 'max' for name in features}
    # the "avg" feature is averaged, like the other ft_avg_* features in this notebook
    how_to_aggregate['ft_avg_icu_los_within_30'] = 'mean'
    return res.groupby('hadm_id', as_index=False).agg(how_to_aggregate)

In [None]:
def create_hcc_feature(hccs, label='', rename_as=None):
    """Select a single HCC indicator column from the per-admission HCC table.

    Parameters
    ----------
    hccs : DataFrame
        Output of ``create_hcc_labeled_dataset`` (``hadm_id``, ``subject_id``
        and one ``hcc_cd_<code>`` column per HCC).
    label : str, optional
        Suffix identifying the HCC column to keep, e.g. ``'_85'``. When empty,
        every HCC column is kept.
    rename_as : str, optional
        New name for the column ``'hcc_cd' + label``.

    Returns
    -------
    DataFrame
        ``hccs`` without ``subject_id`` and without the unselected HCC columns.
    """
    cols = [x for x in hccs if 'hcc_' in x]
    if label:
        # Match on the column-name suffix: a plain substring test would make
        # '_10' also keep 'hcc_cd_100', 'hcc_cd_108', ...
        drop_cols = [x for x in cols if not x.endswith(label)]
        hccs = hccs.drop(drop_cols, axis=1)

    if rename_as:
        assert isinstance(rename_as, str)
        hccs = hccs.rename(columns={'hcc_cd' + label: rename_as})
    return hccs.drop('subject_id', axis=1)
    
def create_hcc_labeled_dataset(diagnoses, icdxw):
    """Map diagnoses to HCC codes and one-hot encode them per admission.

    ``diagnoses`` is left-joined to the crosswalk ``icdxw`` (``icd9_code`` ->
    ``icd``; the crosswalk must list each code once). The result has one row
    per (``hadm_id``, ``subject_id``) and one ``hcc_cd_<code>`` column per HCC,
    holding the maximum dummy value over the admission's diagnosis rows.
    """
    joined = diagnoses.merge(icdxw, how='left',
                             left_on='icd9_code', right_on='icd',
                             indicator='_hcc_merge', validate='m:1')

    # one indicator column per hcc code, appended to the joined rows
    hcc_dummies = pd.get_dummies(joined.hcc, prefix='hcc_cd')
    joined = pd.concat([joined, hcc_dummies], axis=1)

    hcc_columns = [name for name in joined if 'hcc_cd' in name]
    per_admission = joined.groupby(['hadm_id', 'subject_id'], as_index=False)
    return per_admission[hcc_columns].max()

In [None]:
def create_hypertensive_features(charts):
    """Classify paired systolic/diastolic readings into blood-pressure stages.

    Parameters
    ----------
    charts : DataFrame
        Cleaned chart events; read here: ``hadm_id``, ``eventtime``,
        ``admittime``, ``label``, ``value``, ``valuenum``, ``unitname``.

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` that has a systolic and a diastolic reading at
        the same ``eventtime``, with 0/1 columns ``ft_elevated_bp``,
        ``ft_hbp_stg_1``, ``ft_hbp_stg_2``, ``ft_hbp_crisis`` and
        ``ft_hbp_stg_2_within_<6|12|24|36|48>_hours`` (window measured from
        ``admittime``).
    """
    keep_cols = ['hadm_id', 'eventtime', 'admittime', 'label',
                 'value', 'valuenum', 'unitname']
    labels = charts.label.str.lower()
    # drop readings whose label mentions unloading, pulmonary or pap;
    # na=False: events without a label are never selected
    usable = ~labels.str.contains('unloading|pulmonary|pap', na=False)

    def bp_readings(kind):
        """Rows whose label mentions ``kind`` ('systolic' / 'diastolic')."""
        picked = charts.loc[labels.str.contains(kind, na=False) & usable, keep_cols].copy()
        picked['label'] = f'{kind}_blood_pressure'
        return picked

    # create combined events: keep only timestamps that have both readings
    # NOTE(review): if several systolic/diastolic items are charted at the same
    # time for one admission, every combination is paired — confirm this is intended.
    data = bp_readings('systolic').merge(bp_readings('diastolic'), how='inner',
                                         on=['hadm_id', 'eventtime'],
                                         suffixes=('_systolic', '_diastolic'))

    # make features; explicit half-open bounds so that non-integer readings
    # (e.g. 129.5 or 179.5) and the exact values 180 / 120 fall into a stage
    systolic = data.valuenum_systolic
    diastolic = data.valuenum_diastolic
    elevated = systolic.ge(120) & systolic.lt(130) & diastolic.lt(80)
    hbp_stg_1 = (systolic.ge(130) & systolic.lt(140)) | (diastolic.ge(80) & diastolic.lt(90))
    hbp_stg_2 = (systolic.ge(140) & systolic.le(180)) | (diastolic.ge(90) & diastolic.le(120))
    crisis = systolic.gt(180) | diastolic.gt(120)

    # time since admission
    data['time_delta'] = data.eventtime - data.admittime_systolic

    data['ft_elevated_bp'] = elevated*1
    data['ft_hbp_stg_1'] = hbp_stg_1*1
    data['ft_hbp_stg_2'] = hbp_stg_2*1
    data['ft_hbp_crisis'] = crisis*1

    for hours in [6, 12, 24, 36, 48]:
        within_window = data.time_delta < pd.Timedelta(hours=hours)
        data[f'ft_hbp_stg_2_within_{hours}_hours'] = (hbp_stg_2 & within_window) * 1

    features = [x for x in data if 'ft_' in x]
    data = data.groupby('hadm_id', as_index=False)[features].max()
    return data

In [None]:
def create_hematocrit_features(charts, pt):
    """Summarise hematocrit readings per admission against sex-specific ranges.

    Parameters
    ----------
    charts : DataFrame
        Cleaned chart events; read here: ``hadm_id``, ``label``, ``valuenum``
        (plus ``eventtime``, ``admittime``, ``value``, ``unitname``).
    pt : DataFrame
        Per-admission demographics with ``hadm_id`` and ``ft_gender``
        (1 is treated as male; every other value uses the female range).

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` with ``ft_avg_hematocrit`` (mean reading) and
        the 0/1 flags ``ft_above_normal_hematocrit``,
        ``ft_below_normal_hematocrit`` and ``ft_way_below_normal_hematocrit``
        (1 if any reading qualifies).
    """
    labels = charts.label.str.lower()
    # na=False: events without a label are skipped; apache-labelled items are excluded
    is_hematocrit = (labels.str.contains('hematocrit', na=False)
                     & ~labels.str.contains('apache', na=False))
    res = charts.loc[is_hematocrit,
                     ['hadm_id', 'eventtime', 'admittime', 'label',
                      'value', 'valuenum', 'unitname']]

    # add the gender
    res = res.merge(pt[['hadm_id', 'ft_gender']], how='left', on='hadm_id')

    # clean the label name
    res['label'] = 'hematocrit'

    # add features
    male = res.ft_gender == 1
    male_range = (42, 50)
    female_range = (37, 47)
    # NOTE(review): rows with an unknown gender fall under ~male and are
    # therefore judged against the female range — confirm this is intended.
    above_normal = (male & (res.valuenum > male_range[1])) | (~male & (res.valuenum > female_range[1]))
    below_normal = (male & (res.valuenum < male_range[0])) | (~male & (res.valuenum < female_range[0]))
    way_below_normal = res.valuenum < 20

    res['ft_avg_hematocrit'] = res.valuenum
    res['ft_above_normal_hematocrit'] = above_normal*1
    res['ft_below_normal_hematocrit'] = below_normal*1
    res['ft_way_below_normal_hematocrit'] = way_below_normal*1
    res = res.drop('ft_gender', axis=1)

    agg = {'ft_avg_hematocrit': 'mean',
           'ft_above_normal_hematocrit': 'max',
           'ft_below_normal_hematocrit': 'max',
           'ft_way_below_normal_hematocrit': 'max'}

    return res.groupby('hadm_id', as_index=False).agg(agg)

In [None]:
def create_demographics_features(admissions, patients, max_plausible_age=120):
    """Build age, gender, admission-type and ethnicity features per admission.

    Parameters
    ----------
    admissions : DataFrame
        Read here: ``hadm_id``, ``subject_id``, ``admittime``,
        ``admission_type``, ``ethnicity``.
    patients : DataFrame
        Read here: ``subject_id``, ``gender`` ('F' / 'M'), ``dob``.
    max_plausible_age : float, optional
        Ages above this (in years) are treated as coming from a fake date of
        birth and set to NaN.

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` with ``ft_age`` (years), ``ft_gender``
        (0 = F, 1 = M) and one indicator column per admission type
        (``ft_admit_type_*``) and ethnicity (``ft_race_*``).
    """
    pt = admissions.merge(patients, how='left', on='subject_id')

    # convert data types
    pt['dob'] = pd.to_datetime(pt.dob)
    pt['admittime'] = pd.to_datetime(pt.admittime)

    # remap gender as a binary variable
    pt['ft_gender'] = pt.gender.map({'F': 0, 'M': 1})

    # create an age feature
    # Work at day resolution: a nanosecond timedelta can only span ~292 years,
    # so subtracting a date of birth that lies ~300 years before the admission
    # overflows. Whole days have no such limit; a missing date yields NaN.
    admit_day = pt.admittime.values.astype('datetime64[D]')
    birth_day = pt.dob.values.astype('datetime64[D]')
    age_years = (admit_day - birth_day) / np.timedelta64(1, 'D') / 365.25
    # null out fake dobs, they don't give us information
    implausible = (age_years < -1) | (age_years > max_plausible_age)
    pt['ft_age'] = np.where(implausible, np.nan, age_years)

    # admit type feature
    admit_type = pd.get_dummies(pt['admission_type'].str.lower(),
                                prefix='ft_admit_type')

    # ethnicity feature (literal replacements, independent of the pandas regex default)
    ethnicity = pd.get_dummies(pt['ethnicity'].str.lower()
                                 .str.replace('/', '_', regex=False)
                                 .str.replace(' ', '_', regex=False),
                               prefix='ft_race')

    # the dummies share pt's index, so they can be placed side by side directly
    data = pd.concat([pt[['hadm_id', 'ft_age', 'ft_gender']], admit_type, ethnicity],
                     axis=1)
    agg_dict = {'ft_age': 'mean', 'ft_gender': 'first'}
    agg_dict.update({k: 'max' for k in admit_type.columns})
    agg_dict.update({k: 'max' for k in ethnicity.columns})

    return data.groupby('hadm_id', as_index=False).agg(agg_dict)

In [None]:
def create_creatinine_features(charts, test=False):
    """Derive creatinine-trend features per admission.

    Parameters
    ----------
    charts : DataFrame
        Cleaned chart events; read here: ``hadm_id``, ``eventtime``,
        ``admittime``, ``label``, ``value``, ``valuenum``, ``unitname``.
    test : bool, optional
        When True, return the row-level data of admissions that triggered
        ``ft_creatinine_increase_within_48`` instead of the aggregated features.

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` with:

        * ``ft_creatinine_increase_within_48`` - 1 if a reading is at least 0.3
          above the previous reading taken no more than 48 hours earlier;
        * ``ft_creatinine_increase_from_baseline`` - 1 if a reading is at least
          1.5 times the admission's first reading;
        * ``ft_baseline_creat_gt_1`` - 1 if the first reading is above 1;
        * ``ft_avg_creatinine`` - mean reading.
    """
    # make a dataframe of just creatinine data
    # (na=False: events without a label are skipped rather than breaking the mask)
    is_creatinine = charts.label.str.lower().str.contains('creatin', na=False)
    res = charts.loc[is_creatinine,
                     ['hadm_id', 'eventtime', 'admittime', 'label',
                      'value', 'valuenum', 'unitname']]

    # drop crazy values (this also removes rows without a numeric value)
    res = res.loc[res.valuenum <= 11]

    # sort by hospital admission and event time
    res = res.sort_values(['hadm_id', 'eventtime']).reset_index(drop=True)

    # a row has a "previous test" only if the row above belongs to the same
    # admission; otherwise it is compared with itself (delta 0)
    same_admission = res.hadm_id == res.hadm_id.shift(1)

    # get the value of the old test and compare to current test
    res['value_of_previous_test'] = res.valuenum.shift(1).where(same_admission, res.valuenum)
    res['delta'] = res.valuenum - res.value_of_previous_test

    # get time previous test was administered and compare to current time
    res['time_of_previous_test'] = res.eventtime.shift(1).where(same_admission, res.eventtime)
    res['delta_time'] = res.eventtime - res.time_of_previous_test

    # check if time is within a certain range
    res['within_48'] = res.delta_time <= pd.Timedelta(48, 'h')
    res['baseline_creat'] = res.groupby('hadm_id')['valuenum'].transform('first')

    # make features
    res['ft_creatinine_increase_within_48'] = ((res.delta >= 0.3) & res.within_48)*1
    res['ft_creatinine_increase_from_baseline'] = (res.valuenum >= 1.5*res.baseline_creat)*1
    res['ft_baseline_creat_gt_1'] = (res.baseline_creat > 1) * 1
    res['ft_avg_creatinine'] = res.valuenum

    if test:
        triggered = res.groupby('hadm_id')\
                       ['ft_creatinine_increase_within_48'].transform('max') == 1
        return res.loc[triggered,
                       ['hadm_id', 'valuenum', 'delta', 'delta_time',
                        'baseline_creat',
                        'ft_creatinine_increase_within_48',
                        'ft_creatinine_increase_from_baseline',
                        'ft_baseline_creat_gt_1']]

    features = [x for x in res if 'ft_' in x]
    return res.groupby('hadm_id', as_index=False)[features]\
              .agg({'ft_creatinine_increase_within_48': 'max',
                    'ft_creatinine_increase_from_baseline': 'max',
                    'ft_baseline_creat_gt_1': 'max',
                    'ft_avg_creatinine': 'mean'})

In [None]:
def create_urine_features(charts):
    """One-hot encode the charted urine colour and appearance per admission.

    Parameters
    ----------
    charts : DataFrame
        Cleaned chart events; read here: ``hadm_id``, ``label``, ``value``
        (plus ``eventtime``, ``admittime``, ``valuenum``, ``unitname``).

    Returns
    -------
    DataFrame
        One row per ``hadm_id`` that has a 'Urine Color' or 'Urine Appearance'
        entry, with one ``ft_urine_color_<value>`` / ``ft_urine_appearance_<value>``
        column per observed (lower-cased) value. A column is NaN for admissions
        that have no entry of that kind.
    """
    # na=False: events without a label are skipped rather than breaking the mask
    is_urine = charts.label.str.lower().str.contains('urine', na=False)
    # copy so that cleaning the label does not write into a slice of `charts`
    res = charts.loc[is_urine,
                     ['hadm_id', 'eventtime', 'admittime', 'label',
                      'value', 'valuenum', 'unitname']].copy()

    # clean the label column: strip literal square brackets
    res['label'] = res.label.str.replace('[', '', regex=False)\
                            .str.replace(']', '', regex=False)

    def one_hot_by_admission(label):
        """Indicator columns for the values charted under ``label``, one row per admission."""
        rows = res.loc[res.label == label]
        # the 'ft_' prefix (with underscore) is what marks a column as a feature
        prefix = 'ft_' + label.lower().replace(' ', '_')
        flags = pd.concat([rows.hadm_id,
                           pd.get_dummies(rows.value.str.lower(), prefix=prefix)],
                          axis=1)
        return flags.groupby('hadm_id', as_index=False).max()

    # Aggregate each attribute to one row per admission BEFORE joining, so the
    # join is one-to-one instead of pairing every colour row with every
    # appearance row of the admission. The outer join keeps admissions that
    # have only one of the two attributes.
    color = one_hot_by_admission('Urine Color')
    appearance = one_hot_by_admission('Urine Appearance')
    return color.merge(appearance, how='outer', on='hadm_id')

**Final Feature Creation**
---

In [None]:
# Build every per-admission feature table.
demographic_features = create_demographics_features(admissions, patients)
creatinine_features = create_creatinine_features(charts)
urine_features = create_urine_features(charts)
# the hematocrit ranges need ft_gender, which comes from the demographics table
hematocrit_features = create_hematocrit_features(charts, demographic_features)
hypertensive_feature = create_hypertensive_features(charts)
# comorbidity indicators: one HCC column each, selected from the full HCC table
hcc_labeled_data = create_hcc_labeled_dataset(diagnoses_icd, icd2hccxw2014)
diabetes_hcc_feature = create_hcc_feature(hcc_labeled_data, label='_19', rename_as='hcc_cd_19_dbtes_wo_comp')
ckd5_hcc_feature = create_hcc_feature(hcc_labeled_data, label='_136', rename_as='hcc_cd_136_ckd_stg_5')
ckd4_hcc_feature = create_hcc_feature(hcc_labeled_data, label='_137', rename_as='hcc_cd_137_ckd_stg_4')
chf_hcc_feature = create_hcc_feature(hcc_labeled_data, label='_85', rename_as='hcc_cd_85_chf')
vascular_disease_hcc_feature = create_hcc_feature(hcc_labeled_data, label='_108', rename_as='hcc_cd_108_vascular')
prior_admission_features = create_prior_admissions(admissions, icustays)
blood_ph_features = create_blood_ph_features(charts)
nephrotoxin_features = add_nephrotoxin_features(prescriptions, admissions)
contrast_imaging_feature = create_contrast_imaging_feature(cptevents)

In [None]:
def merge_features(feature_list):
    """Outer-join all feature tables on ``hadm_id``, left to right."""
    def outer_join_on_admission(left, right):
        return pd.merge(left, right, how='outer', on='hadm_id')

    return functools.reduce(outer_join_on_admission, feature_list)

In [None]:
# Every table below is keyed by hadm_id. The label table goes first, with the
# columns that are not model inputs removed.
features = [
    df.drop(['subject_id', 'admittime', 'dischtime'], axis=1),
    demographic_features,
    creatinine_features,
    urine_features,
    hematocrit_features,
    hypertensive_feature,
    diabetes_hcc_feature,
    ckd4_hcc_feature,
    ckd5_hcc_feature,
    chf_hcc_feature,
    vascular_disease_hcc_feature,
    prior_admission_features,
    blood_ph_features,
    nephrotoxin_features,
    contrast_imaging_feature
]

# Outer-join everything; an admission missing from a feature table gets NaN
# in that table's columns.
data = merge_features(features)
data