In [1]:
from feat_eng_prod import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

2019-12-01-15-45


**Observations with Labels**
---
Using the icd-hcc crosswalk, the icd diagnosis data for each admission and the admissions table, create a labeled dataset with the target variable. We will use this dataframe as the scaffolding to generate all the features that we need and then merge them on. The goal of this ipython notebook is to parse through all the different tables and create a final feature dataset that we can use for our modeling exercise.

In [2]:
icdxw = read_crosswalk()
diagnoses_icd = read_prod_data('diagnoses_icd')
admissions = read_prod_data('admissions')
df = add_aki_hcc_label(diagnoses_icd, icdxw, admissions)
hcc_data = create_hcc_labeled_dataset(diagnoses_icd, icdxw)

printer('labeled dataset')
df.shape
df.sample(5)
df.hcc_cd_135.value_counts(normalize=True)
del diagnoses_icd
del icdxw

diagnoses_icd (651047, 5)
admissions (58976, 19)
labeled dataset
-------------------


(58976, 5)

Unnamed: 0,hadm_id,subject_id,hcc_cd_135,admittime,dischtime
51069,186565,4778,0,2174-03-22 19:48:00,2174-03-28 21:35:00
57044,196681,18363,1,2179-03-31 22:29:00,2179-04-15 10:45:00
47960,181225,14817,0,2159-04-16 10:31:00,2159-04-20 17:00:00
2685,104495,26853,0,2187-08-07 19:48:00,2187-08-14 16:15:00
26891,145615,18038,0,2103-11-01 04:01:00,2103-11-12 08:38:00


0    0.806328
1    0.193672
Name: hcc_cd_135, dtype: float64

Notice that 20% of the data are positives. We might want to consider undersampling the negatives or oversampling the positives (when we model the data) so that we can have an even distribution of this event.

**Demographic Features**
---
Read in the patients table and create demographic features. Some of the main risk factors for "Acute Kidney Injury" (AKI) are **age**, **gender** and **ethnicity**. We also pull out the **admission type** of the stay to see if knowing that the hospital admission was due to an emergency condition or an elective procedure could be useful to us.

In [3]:
patients = read_prod_data('patients')
demographic_features = create_demographics_features(admissions, patients)
del patients
printer('demographic features')
demographic_features.shape
demographic_features.columns
demographic_features.sample(5)

patients (46520, 8)
demographic features
-------------------


(58976, 48)

Index(['hadm_id', 'ft_age', 'ft_gender', 'ft_admit_type_elective',
       'ft_admit_type_emergency', 'ft_admit_type_newborn',
       'ft_admit_type_urgent', 'ft_race_american_indian_alaska_native',
       'ft_race_american_indian_alaska_native_federally_recognized_tribe',
       'ft_race_asian', 'ft_race_asian_asian_indian',
       'ft_race_asian_cambodian', 'ft_race_asian_chinese',
       'ft_race_asian_filipino', 'ft_race_asian_japanese',
       'ft_race_asian_korean', 'ft_race_asian_other', 'ft_race_asian_thai',
       'ft_race_asian_vietnamese', 'ft_race_black_african',
       'ft_race_black_african_american', 'ft_race_black_cape_verdean',
       'ft_race_black_haitian', 'ft_race_caribbean_island',
       'ft_race_hispanic_latino_central_american_(other)',
       'ft_race_hispanic_latino_colombian', 'ft_race_hispanic_latino_cuban',
       'ft_race_hispanic_latino_dominican',
       'ft_race_hispanic_latino_guatemalan',
       'ft_race_hispanic_latino_honduran', 'ft_race_hispanic_la

Unnamed: 0,hadm_id,ft_age,ft_gender,ft_admit_type_elective,ft_admit_type_emergency,ft_admit_type_newborn,ft_admit_type_urgent,ft_race_american_indian_alaska_native,ft_race_american_indian_alaska_native_federally_recognized_tribe,ft_race_asian,ft_race_asian_asian_indian,ft_race_asian_cambodian,ft_race_asian_chinese,ft_race_asian_filipino,ft_race_asian_japanese,ft_race_asian_korean,ft_race_asian_other,ft_race_asian_thai,ft_race_asian_vietnamese,ft_race_black_african,ft_race_black_african_american,ft_race_black_cape_verdean,ft_race_black_haitian,ft_race_caribbean_island,ft_race_hispanic_latino_central_american_(other),ft_race_hispanic_latino_colombian,ft_race_hispanic_latino_cuban,ft_race_hispanic_latino_dominican,ft_race_hispanic_latino_guatemalan,ft_race_hispanic_latino_honduran,ft_race_hispanic_latino_mexican,ft_race_hispanic_latino_puerto_rican,ft_race_hispanic_latino_salvadoran,ft_race_hispanic_or_latino,ft_race_middle_eastern,ft_race_multi_race_ethnicity,ft_race_native_hawaiian_or_other_pacific_islander,ft_race_other,ft_race_patient_declined_to_answer,ft_race_portuguese,ft_race_south_american,ft_race_unable_to_obtain,ft_race_unknown_not_specified,ft_race_white,ft_race_white_brazilian,ft_race_white_eastern_european,ft_race_white_other_european,ft_race_white_russian
28086,147643,74.915811,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9078,115278,79.356605,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
51988,188153,66.28063,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2020,103371,67.293634,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
42544,172127,74.502396,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


Note that some members don't have an age because they were so old that it might have been too easy to identify them. 
Therefore, their dates of birth were nulled out to protect their identity.

**HCC's for Comorbidities**
---
Create features for HCCs that are comorbidities.

In [4]:
diabetes_hcc_feature = create_hcc_feature(hcc_data, label='_19', rename_as='hcc_cd_19_dbtes_wo_comp')
ckd5_hcc_feature = create_hcc_feature(hcc_data, label='_136', rename_as='hcc_cd_136_ckd_stg_5')
ckd4_hcc_feature = create_hcc_feature(hcc_data, label='_137', rename_as='hcc_cd_137_ckd_stg_4')
chf_hcc_feature = create_hcc_feature(hcc_data, label='_85', rename_as='hcc_cd_85_chf')
vascular_disease_hcc_feature = create_hcc_feature(hcc_data, label='_108', rename_as='hcc_cd_108_vascular')

del hcc_data
printer('hcc features')
diabetes_hcc_feature.shape
ckd5_hcc_feature.shape
ckd4_hcc_feature.shape
chf_hcc_feature.shape
vascular_disease_hcc_feature.shape
vascular_disease_hcc_feature.sample(5)

hcc features
-------------------


(58976, 2)

(58976, 2)

(58976, 2)

(58976, 2)

(58976, 2)

Unnamed: 0,hadm_id,hcc_cd_108_vascular
30828,152248,1
17251,129145,0
52729,189409,1
52061,188260,1
12627,121317,0


**Prior Admissions / ICU Stays**
---

In [5]:
# admissions, icu stays
icustays = read_prod_data('icustays')
prior_admission_features = create_prior_admissions(admissions, icustays)
del icustays

icustays (61532, 12)


**Prescriptions that are nephrotoxins**
---

In [7]:
# prescription
prescriptions = read_prod_data('prescriptions')
nephrotoxin_features = add_nephrotoxin_features(prescriptions, admissions)
del prescriptions

prescriptions (4156450, 19)


In [13]:
nephrotoxin_features.describe().T.sort_values('mean', ascending=False).head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ft_any_nephrotoxin_rx,50216.0,0.828959,0.376549,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_72,50216.0,0.785546,0.410447,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_48,50216.0,0.761968,0.425883,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_24,50216.0,0.71523,0.451309,0.0,0.0,1.0,1.0,1.0
ft_nephrotoxin_blood_pressure_rx,50216.0,0.619842,0.48543,0.0,0.0,1.0,1.0,1.0


**Contrast Imaging Procedures**
---

In [16]:
# cptevents
# cptevents = read_prod_data('cptevents')
# contrast_imaging_feature = create_contrast_imaging_feature(cptevents)
# del cptevents

cptevents (573146, 12)


**Charts Data**
---
View the feature space that's created for the chart data. Each feature space looks at another data point in the charts table.

In [37]:
one_bin = True

In [38]:
d_items = read_prod_data('d_items')

if not one_bin:
    bins = [hex(i)[2] + c for i in range(0, 16) for c in [hex(d)[2] for d in range(0, 16)]]   
else:
    bins = ['0']

# chart features
chart_features = [charts_data_wrapper(b, d_items, df, demographic_features) for b in bins]
chart_features = pd.concat(chart_features, sort=False)

del d_items
printer('charts features')
chart_features.shape
chart_features.columns
chart_features.sample(5)

d_items (12487, 10)
chartevents (1000000, 15)
creatinine features
-------------------
(1, 5)
hematocrit features
-------------------
(1, 5)
hypertensive features
-------------------
(1, 10)
blood ph features
-------------------
(1, 7)
charts features
-------------------


(1, 24)

Index(['hadm_id', 'ft_creatinine_increase_within_48',
       'ft_creatinine_increase_from_baseline', 'ft_baseline_creat_gt_1',
       'ft_avg_creatinine', 'ft_avg_hematocrit', 'ft_above_normal_hematocrit',
       'ft_below_normal_hematocrit', 'ft_way_below_normal_hematocrit',
       'ft_elevated_bp', 'ft_hbp_stg_1', 'ft_hbp_stg_2', 'ft_hbp_crisis',
       'ft_hbp_stg_2_within_6_hours', 'ft_hbp_stg_2_within_12_hours',
       'ft_hbp_stg_2_within_24_hours', 'ft_hbp_stg_2_within_36_hours',
       'ft_hbp_stg_2_within_48_hours', 'ft_low_blood_ph',
       'ft_low_blood_ph_within_6_hrs', 'ft_low_blood_ph_within_12_hrs',
       'ft_low_blood_ph_within_24_hrs', 'ft_low_blood_ph_within_36_hrs',
       'ft_low_blood_ph_within_48_hrs'],
      dtype='object')

ValueError: Cannot take a larger sample than population when 'replace=False'

In [39]:
chart_features

Unnamed: 0,hadm_id,ft_creatinine_increase_within_48,ft_creatinine_increase_from_baseline,ft_baseline_creat_gt_1,ft_avg_creatinine,ft_avg_hematocrit,ft_above_normal_hematocrit,ft_below_normal_hematocrit,ft_way_below_normal_hematocrit,ft_elevated_bp,ft_hbp_stg_1,ft_hbp_stg_2,ft_hbp_crisis,ft_hbp_stg_2_within_6_hours,ft_hbp_stg_2_within_12_hours,ft_hbp_stg_2_within_24_hours,ft_hbp_stg_2_within_36_hours,ft_hbp_stg_2_within_48_hours,ft_low_blood_ph,ft_low_blood_ph_within_6_hrs,ft_low_blood_ph_within_12_hrs,ft_low_blood_ph_within_24_hrs,ft_low_blood_ph_within_36_hrs,ft_low_blood_ph_within_48_hrs
0,0,1,1,0,1.512549,29.491215,1,1,1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0


In [None]:





    



    # chart data



    data.to_csv(result_dir + 'features.csv', index=False)


**Merge & Create Final Dataframe**
---

In [None]:
features = [
    df.drop(['subject_id', 'admittime', 'dischtime'], axis=1),
    demographic_features,
    chart_features_by_bin,
    diabetes_hcc_feature,
    ckd4_hcc_feature,
    ckd5_hcc_feature,
    chf_hcc_feature,
    vascular_disease_hcc_feature,
    prior_admission_features,
    nephrotoxin_features,
    contrast_imaging_feature
]

data = merge_features(features)
printer('final dataframe')
print(data.shape)