In [1]:
# The star import supplies the helpers called throughout this notebook
# (read_prod_data, printer, merge_features, ...). Presumably it also exports
# pd, os and result_dir, which later cells use without importing -- TODO confirm.
from feature_engineering_utilities import *
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; the cells below rely on this to show several shapes/samples per cell.
InteractiveShell.ast_node_interactivity = "all"

2019-12-03-14-37


**Observations with Labels**
---
Using the icd-hcc crosswalk, the icd diagnosis data for each admission and the admissions table, create a labeled dataset with the target variable. We will use this dataframe as the scaffolding to generate all the features that we need and then merge them on. The goal of this ipython notebook is to parse through all the different tables and create a final feature dataset that we can use for our modeling exercise.

In [2]:
# Load the ICD->HCC crosswalk plus the two raw tables needed for labeling.
icdxw = read_crosswalk()
diagnoses_icd = read_prod_data('diagnoses_icd')
admissions = read_prod_data('admissions')

# `df` is the per-admission scaffold carrying the hcc_cd_135 target;
# `hcc_data` keeps the HCC flags used later for the comorbidity features.
df = add_aki_hcc_label(diagnoses_icd, icdxw, admissions)
hcc_data = create_hcc_labeled_dataset(diagnoses_icd, icdxw)

printer('labeled dataset')
df.shape
df.sample(5)
# Class balance of the target.
df['hcc_cd_135'].value_counts(normalize=True)

# The raw diagnosis rows and the crosswalk are not needed past this point.
del diagnoses_icd, icdxw

diagnoses_icd (651047, 5)
admissions (58976, 19)
labeled dataset
-------------------


(58976, 5)

Unnamed: 0,hadm_id,subject_id,hcc_cd_135,admittime,dischtime
52006,188176,25696,1,2170-09-27 16:10:00,2170-11-12 10:00:00
48383,181977,45213,1,2105-03-11 22:57:00,2105-04-05 16:15:00
58216,198677,23568,0,2114-10-15 17:31:00,2114-11-10 15:33:00
17119,128931,6750,0,2165-08-29 20:06:00,2165-09-14 12:40:00
30758,152136,66907,0,2145-10-03 19:35:00,2145-10-14 13:39:00


0    0.806328
1    0.193672
Name: hcc_cd_135, dtype: float64

Notice that roughly 19% of the admissions are positives. When we model the data, we might want to consider undersampling the negatives or oversampling the positives so that the two classes are more evenly balanced.

**Demographic Features**
---
Read in the patients table and create demographic features. Some of the main risk factors for "Acute Kidney Injury" (AKI) are **age**, **gender** and **ethnicity**. We also pull out the **admission type** of the stay to see if knowing that the hospital admission was due to an emergency condition or an elective procedure could be useful to us.

In [3]:
# Demographic features are derived from the admissions table (loaded earlier)
# joined with the patients table.
patients = read_prod_data('patients')
demographic_features = create_demographics_features(admissions, patients)
# The raw patients table is only needed to build the features above.
del patients
printer('demographic features')
# Each bare expression below is displayed because the notebook sets
# ast_node_interactivity = "all".
demographic_features.shape
demographic_features.columns
demographic_features.sample(5)

patients (46520, 8)
demographic features
-------------------


(58976, 47)

Index(['hadm_id', 'ft_age', 'ft_gender', 'ft_admit_type_elective',
       'ft_admit_type_emergency', 'ft_admit_type_newborn',
       'ft_admit_type_urgent', 'ft_race_american_indian_alaska_native',
       'ft_race_american_indian_alaska_native_federally_recognized_tribe',
       'ft_race_asian', 'ft_race_asian_asian_indian',
       'ft_race_asian_cambodian', 'ft_race_asian_chinese',
       'ft_race_asian_filipino', 'ft_race_asian_japanese',
       'ft_race_asian_korean', 'ft_race_asian_other', 'ft_race_asian_thai',
       'ft_race_asian_vietnamese', 'ft_race_black_african',
       'ft_race_black_african_american', 'ft_race_black_cape_verdean',
       'ft_race_black_haitian', 'ft_race_caribbean_island',
       'ft_race_hispanic_latino_central_american_(other)',
       'ft_race_hispanic_latino_colombian', 'ft_race_hispanic_latino_cuban',
       'ft_race_hispanic_latino_dominican',
       'ft_race_hispanic_latino_guatemalan',
       'ft_race_hispanic_latino_honduran', 'ft_race_hispanic_la

Unnamed: 0,hadm_id,ft_age,ft_gender,ft_admit_type_elective,ft_admit_type_emergency,ft_admit_type_newborn,ft_admit_type_urgent,ft_race_american_indian_alaska_native,ft_race_american_indian_alaska_native_federally_recognized_tribe,ft_race_asian,ft_race_asian_asian_indian,ft_race_asian_cambodian,ft_race_asian_chinese,ft_race_asian_filipino,ft_race_asian_japanese,ft_race_asian_korean,ft_race_asian_other,ft_race_asian_thai,ft_race_asian_vietnamese,ft_race_black_african,ft_race_black_african_american,ft_race_black_cape_verdean,ft_race_black_haitian,ft_race_caribbean_island,ft_race_hispanic_latino_central_american_(other),ft_race_hispanic_latino_colombian,ft_race_hispanic_latino_cuban,ft_race_hispanic_latino_dominican,ft_race_hispanic_latino_guatemalan,ft_race_hispanic_latino_honduran,ft_race_hispanic_latino_mexican,ft_race_hispanic_latino_puerto_rican,ft_race_hispanic_latino_salvadoran,ft_race_hispanic_or_latino,ft_race_middle_eastern,ft_race_multi_race_ethnicity,ft_race_native_hawaiian_or_other_pacific_islander,ft_race_other,ft_race_portuguese,ft_race_south_american,ft_race_unable_to_obtain,ft_race_white,ft_race_white_brazilian,ft_race_white_eastern_european,ft_race_white_other_european,ft_race_white_russian,ft_race_missing_info
50847,186216,58.234086,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11045,118625,24.281999,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
47486,180402,59.835729,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54735,192758,71.980835,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54957,193147,48.284736,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


Note that some members don't have an age because they were so old that it might have been too easy to identify them. 
Therefore, their dates of birth were nulled out to protect their identity.

**HCC's for Comorbidities**
---
Create features for HCCs that are comorbidities. The main comorbidities are as follows:
- diabetes (without complications)
- ckd (stage 5) and (stage 4)
- chf
- vascular disease
- chronic liver disease (which was not calculated)

In [4]:
# One feature frame per comorbidity HCC; `label` selects the HCC column(s) in
# hcc_data and `rename_as` is the name requested for the resulting feature.
diabetes_hcc_feature = create_hcc_feature(hcc_data, label='_19', rename_as='ft_hcc_cd_19_dbtes_wo_comp')
ckd5_hcc_feature = create_hcc_feature(hcc_data, label='_136', rename_as='ft_hcc_cd_136_ckd_stg_5')
ckd4_hcc_feature = create_hcc_feature(hcc_data, label='_137', rename_as='ft_hcc_cd_137_ckd_stg_4')
chf_hcc_feature = create_hcc_feature(hcc_data, label='_85', rename_as='ft_hcc_cd_85_chf')
vascular_disease_hcc_feature = create_hcc_feature(hcc_data, label='_108', rename_as='ft_hcc_cd_108_vascular')
# NOTE(review): the recorded shapes below show 38 columns for the '_1' frame
# and 8 for the '_2' frame, while every other frame has 2 (hadm_id + feature).
# Presumably create_hcc_feature matches `label` as a substring, so '_1' also
# picks up hcc_cd_10..19 and hcc_cd_1xx (possibly including the hcc_cd_135
# target) and '_2' picks up hcc_cd_21..29 -- verify the helper and restrict
# these two frames to the single intended HCC before modeling.
hiv_aids_hcc_feature = create_hcc_feature(hcc_data, label='_1', rename_as='ft_hcc_cd_1_hiv_aids')
sepsis_feature = create_hcc_feature(hcc_data, label='_2', rename_as='ft_hcc_cd_2_sepsis')
chronic_hep_feature = create_hcc_feature(hcc_data, label='_29', rename_as='ft_hcc_cd_29_chronic_hepatitis')


# hcc_data is not used after the feature frames have been built.
del hcc_data
printer('hcc features')
# Shapes are shown in the same order the frames were created above.
diabetes_hcc_feature.shape
ckd5_hcc_feature.shape
ckd4_hcc_feature.shape
chf_hcc_feature.shape
vascular_disease_hcc_feature.shape
hiv_aids_hcc_feature.shape
sepsis_feature.shape
chronic_hep_feature.shape
vascular_disease_hcc_feature.sample(5)

hcc features
-------------------


(58976, 2)

(58976, 2)

(58976, 2)

(58976, 2)

(58976, 2)

(58976, 38)

(58976, 8)

(58976, 2)

Unnamed: 0,hadm_id,ft_hcc_cd_108_vascular
4399,107451,0
48785,182684,0
55884,194706,0
11825,119918,0
32560,155222,0


**Prior Admissions / ICU Stays**
---
One hypothesis is that prior admissions or icu stays for the patient within certain timeframes can be good predictors of acuity/severity of a condition or whether this patient's condition is worsening into AKI.

In [5]:
# admissions, icu stays
# Prior-admission and prior-ICU features are built from the admissions table
# (loaded earlier) together with the ICU stays table.
icustays = read_prod_data('icustays')
prior_admission_features = create_prior_admissions(admissions, icustays)
# The raw ICU stays table is not used again in this notebook section.
del icustays
printer('prior admission/icustays features')
# NOTE(review): the recorded shape has far fewer rows (19993) than there are
# admissions (58976); presumably only admissions with a prior stay are
# returned -- confirm how the missing rows are filled after the merge.
prior_admission_features.shape
prior_admission_features.columns
prior_admission_features.sample(5)

icustays (61532, 12)
prior admission/icustays features
-------------------


(19993, 8)

Index(['hadm_id', 'ft_prior_admission_30', 'ft_prior_admission_60',
       'ft_prior_admission_90', 'ft_prior_admission_120',
       'ft_avg_icu_los_within_30', 'ft_micu_within_30', 'ft_ccu_within_30'],
      dtype='object')

Unnamed: 0,hadm_id,ft_prior_admission_30,ft_prior_admission_60,ft_prior_admission_90,ft_prior_admission_120,ft_avg_icu_los_within_30,ft_micu_within_30,ft_ccu_within_30
693,103434,1,1,1,1,3.8273,0,0
17564,187905,1,1,1,1,3.2479,1,0
6709,133755,1,1,1,1,3.5438,1,0
2120,110348,1,1,1,1,1.6514,1,0
17236,186194,1,1,1,1,2.0419,0,0


**Prescriptions**
---
Some drugs that are prescribed for other diseases are nephrotoxic and are significant risk factors for kidney injury. It would be interesting to know whether prescriptions of those drugs significantly increase the risk of AKI.

In [6]:
# prescription
# Nephrotoxin exposure features are built from the prescriptions table and the
# admissions table. The helper prints each drug class, drug and time window
# (24/48/72) as it goes, which is the long indented listing in the output.
prescriptions = read_prod_data('prescriptions')
nephrotoxin_features = add_nephrotoxin_features(prescriptions, admissions)
# The prescriptions table is large (about 4.2M rows per the recorded output);
# drop the name once the features exist.
del prescriptions
printer('nephrotoxins features')
nephrotoxin_features.shape
nephrotoxin_features.columns
nephrotoxin_features.sample(5)

prescriptions (4156450, 19)
	 antibiotics
		 bacitracin
		  24
		  48
		  72
		 vancomycin
		  24
		  48
		  72
		 amphotericin
		  24
		  48
		  72
		 cephalexin
		  24
		  48
		  72
		 cefadroxil
		  24
		  48
		  72
		 tobramycin
		  24
		  48
		  72
		 gentamicin
		  24
		  48
		  72
		 neomycin
		  24
		  48
		  72
		 ciprofloxacin
		  24
		  48
		  72
	 blood_pressure
		 lisinopril
		  24
		  48
		  72
		 ramipril
		  24
		  48
		  72
		 metoprolol
		  24
		  48
		  72
		 candesartan
		  24
		  48
		  72
		 valsartan
		  24
		  48
		  72
		 warfarin
		  24
		  48
		  72
	 diuretic
		 furosemide
		  24
		  48
		  72
		 torsemide
		  24
		  48
		  72
	 nsaid
		 ibuprofen
		  24
		  48
		  72
		 naproxen
		  24
		  48
		  72
	 ulcer
		 cimetidine
		  24
		  48
		  72
	 other
		 propofol
		  24
		  48
		  72
nephrotoxins features
-------------------


(50216, 113)

Index(['hadm_id', 'ft_any_nephrotoxin_rx', 'ft_any_nephrotoxin_rx_within_24',
       'ft_any_nephrotoxin_rx_within_48', 'ft_any_nephrotoxin_rx_within_72',
       'ft_nephrotoxin_bacitracin_rx',
       'ft_nephrotoxin_bacitracin_rx_within_24',
       'ft_nephrotoxin_bacitracin_rx_within_48',
       'ft_nephrotoxin_bacitracin_rx_within_72',
       'ft_nephrotoxin_vancomycin_rx',
       ...
       'ft_nephrotoxin_ulcer_rx_within_48',
       'ft_nephrotoxin_ulcer_rx_within_72', 'ft_nephrotoxin_propofol_rx',
       'ft_nephrotoxin_propofol_rx_within_24',
       'ft_nephrotoxin_propofol_rx_within_48',
       'ft_nephrotoxin_propofol_rx_within_72', 'ft_nephrotoxin_other_rx',
       'ft_nephrotoxin_other_rx_within_24',
       'ft_nephrotoxin_other_rx_within_48',
       'ft_nephrotoxin_other_rx_within_72'],
      dtype='object', length=113)

Unnamed: 0,hadm_id,ft_any_nephrotoxin_rx,ft_any_nephrotoxin_rx_within_24,ft_any_nephrotoxin_rx_within_48,ft_any_nephrotoxin_rx_within_72,ft_nephrotoxin_bacitracin_rx,ft_nephrotoxin_bacitracin_rx_within_24,ft_nephrotoxin_bacitracin_rx_within_48,ft_nephrotoxin_bacitracin_rx_within_72,ft_nephrotoxin_vancomycin_rx,ft_nephrotoxin_vancomycin_rx_within_24,ft_nephrotoxin_vancomycin_rx_within_48,ft_nephrotoxin_vancomycin_rx_within_72,ft_nephrotoxin_amphotericin_rx,ft_nephrotoxin_amphotericin_rx_within_24,ft_nephrotoxin_amphotericin_rx_within_48,ft_nephrotoxin_amphotericin_rx_within_72,ft_nephrotoxin_cephalexin_rx,ft_nephrotoxin_cephalexin_rx_within_24,ft_nephrotoxin_cephalexin_rx_within_48,ft_nephrotoxin_cephalexin_rx_within_72,ft_nephrotoxin_cefadroxil_rx,ft_nephrotoxin_cefadroxil_rx_within_24,ft_nephrotoxin_cefadroxil_rx_within_48,ft_nephrotoxin_cefadroxil_rx_within_72,ft_nephrotoxin_tobramycin_rx,ft_nephrotoxin_tobramycin_rx_within_24,ft_nephrotoxin_tobramycin_rx_within_48,ft_nephrotoxin_tobramycin_rx_within_72,ft_nephrotoxin_gentamicin_rx,ft_nephrotoxin_gentamicin_rx_within_24,ft_nephrotoxin_gentamicin_rx_within_48,ft_nephrotoxin_gentamicin_rx_within_72,ft_nephrotoxin_neomycin_rx,ft_nephrotoxin_neomycin_rx_within_24,ft_nephrotoxin_neomycin_rx_within_48,ft_nephrotoxin_neomycin_rx_within_72,ft_nephrotoxin_ciprofloxacin_rx,ft_nephrotoxin_ciprofloxacin_rx_within_24,ft_nephrotoxin_ciprofloxacin_rx_within_48,ft_nephrotoxin_ciprofloxacin_rx_within_72,ft_nephrotoxin_antibiotics_rx,ft_nephrotoxin_antibiotics_rx_within_24,ft_nephrotoxin_antibiotics_rx_within_48,ft_nephrotoxin_antibiotics_rx_within_72,ft_nephrotoxin_lisinopril_rx,ft_nephrotoxin_lisinopril_rx_within_24,ft_nephrotoxin_lisinopril_rx_within_48,ft_nephrotoxin_lisinopril_rx_within_72,ft_nephrotoxin_ramipril_rx,ft_nephrotoxin_ramipril_rx_within_24,ft_nephrotoxin_ramipril_rx_within_48,ft_nephrotoxin_ramipril_rx_within_72,ft_nephrotoxin_metoprolol_rx,ft_nephrotoxin_metoprolol_rx_within_24,ft_nephrotoxin_metoprolol_rx_within
_48,ft_nephrotoxin_metoprolol_rx_within_72,ft_nephrotoxin_candesartan_rx,ft_nephrotoxin_candesartan_rx_within_24,ft_nephrotoxin_candesartan_rx_within_48,ft_nephrotoxin_candesartan_rx_within_72,ft_nephrotoxin_valsartan_rx,ft_nephrotoxin_valsartan_rx_within_24,ft_nephrotoxin_valsartan_rx_within_48,ft_nephrotoxin_valsartan_rx_within_72,ft_nephrotoxin_warfarin_rx,ft_nephrotoxin_warfarin_rx_within_24,ft_nephrotoxin_warfarin_rx_within_48,ft_nephrotoxin_warfarin_rx_within_72,ft_nephrotoxin_blood_pressure_rx,ft_nephrotoxin_blood_pressure_rx_within_24,ft_nephrotoxin_blood_pressure_rx_within_48,ft_nephrotoxin_blood_pressure_rx_within_72,ft_nephrotoxin_furosemide_rx,ft_nephrotoxin_furosemide_rx_within_24,ft_nephrotoxin_furosemide_rx_within_48,ft_nephrotoxin_furosemide_rx_within_72,ft_nephrotoxin_torsemide_rx,ft_nephrotoxin_torsemide_rx_within_24,ft_nephrotoxin_torsemide_rx_within_48,ft_nephrotoxin_torsemide_rx_within_72,ft_nephrotoxin_diuretic_rx,ft_nephrotoxin_diuretic_rx_within_24,ft_nephrotoxin_diuretic_rx_within_48,ft_nephrotoxin_diuretic_rx_within_72,ft_nephrotoxin_ibuprofen_rx,ft_nephrotoxin_ibuprofen_rx_within_24,ft_nephrotoxin_ibuprofen_rx_within_48,ft_nephrotoxin_ibuprofen_rx_within_72,ft_nephrotoxin_naproxen_rx,ft_nephrotoxin_naproxen_rx_within_24,ft_nephrotoxin_naproxen_rx_within_48,ft_nephrotoxin_naproxen_rx_within_72,ft_nephrotoxin_nsaid_rx,ft_nephrotoxin_nsaid_rx_within_24,ft_nephrotoxin_nsaid_rx_within_48,ft_nephrotoxin_nsaid_rx_within_72,ft_nephrotoxin_cimetidine_rx,ft_nephrotoxin_cimetidine_rx_within_24,ft_nephrotoxin_cimetidine_rx_within_48,ft_nephrotoxin_cimetidine_rx_within_72,ft_nephrotoxin_ulcer_rx,ft_nephrotoxin_ulcer_rx_within_24,ft_nephrotoxin_ulcer_rx_within_48,ft_nephrotoxin_ulcer_rx_within_72,ft_nephrotoxin_propofol_rx,ft_nephrotoxin_propofol_rx_within_24,ft_nephrotoxin_propofol_rx_within_48,ft_nephrotoxin_propofol_rx_within_72,ft_nephrotoxin_other_rx,ft_nephrotoxin_other_rx_within_24,ft_nephrotoxin_other_rx_within_48,ft_nephrotoxin_other_rx_within_
72
8784,117345,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12147,124157,1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41849,183389,1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
16657,133150,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
16596,133030,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Summary statistics with one row per feature, showing the five features with
# the highest mean (for 0/1 flags the mean is the share of rows flagged).
nephrotoxin_features.describe().T.sort_values('mean', ascending=False).head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ft_any_nephrotoxin_rx,50216.0,0.883682,0.320608,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_72,50216.0,0.845906,0.361043,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_48,50216.0,0.822148,0.382392,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_24,50216.0,0.7732,0.418766,0.0,1.0,1.0,1.0,1.0
ft_nephrotoxin_blood_pressure_rx,50216.0,0.619842,0.48543,0.0,0.0,1.0,1.0,1.0


**Contrast Imaging Procedures**
---
Radiology/imaging procedures that use contrast dyes might cause AKI in patients with chronic kidney disease or in those who are otherwise at particular risk. In these settings, the dye itself is a nephrotoxin. **This feature is currently broken because I'm using some outdated CPT codes**.

In [8]:
# CPT events are loaded once here and reused by the mechanical-ventilation
# cell that follows, which is why the table is not deleted in this cell.
cptevents = read_prod_data('cptevents')
contrast_imaging_feature = create_contrast_imaging_feature(cptevents)
printer('contrast dyes features')
contrast_imaging_feature.shape
contrast_imaging_feature.sample(5)
# NOTE(review): the recorded describe() output is 0 for every statistic, i.e.
# the flag is never set -- the feature carries no signal until it is fixed.
contrast_imaging_feature.describe()

cptevents (573146, 12)
contrast dyes features
-------------------


(44148, 2)

Unnamed: 0,hadm_id,ft_contrast_imaging
37527,185101,0
30452,169188,0
10684,124216,0
32612,174045,0
2753,106216,0


Unnamed: 0,ft_contrast_imaging
count,44148.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


**Mechanical Ventilation**
---
According to some of the papers I read, mechanical ventilation is a risk factor for kidney injury.

In [9]:
# Reuses the cptevents table loaded by the contrast-imaging cell.
mechanical_ventilation_feature = create_mechanical_ventilation_feature(cptevents)
# Last use of cptevents in this notebook section.
del cptevents
printer('mechanical ventilation features')
mechanical_ventilation_feature.shape
mechanical_ventilation_feature.sample(5)
mechanical_ventilation_feature.describe()

mechanical ventilation features
-------------------


(44148, 2)

Unnamed: 0,hadm_id,ft_mechanical_ventilation
33701,176458,0
18544,142063,0
43715,199008,1
9450,121387,0
25350,157640,0


Unnamed: 0,ft_mechanical_ventilation
count,44148.0
mean,0.371591
std,0.483235
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


**Charts Data**
---
View the feature space that's created for the chart data. Each feature space looks at another data point in the charts table.

In [10]:
# Switches read by the chart-features cell that follows.
# one_bin: when True, only a fixed list of four bins is processed instead of
#          all bins (despite the name, it is not a single bin).
one_bin = False
# execute: when True, chart features are recomputed and written to
#          result_dir; when False, a previously saved CSV is loaded instead.
execute = False

In [11]:
# Build the chart-derived features, or reload a copy saved by an earlier run.
# Recomputing processes the chart data bin by bin (presumably expensive, hence
# the `execute` switch); otherwise the snapshot below is read back.
# Update this path whenever the chart features are regenerated.
CHART_FEATURES_CACHE = 'results/2019-12-03-13-23/chart_features.csv'

if execute:
    d_items = read_prod_data('d_items')

    if one_bin:
        # Fixed subset of four bins for a quick partial run.
        bins = ['00', 'ff', '22', '3e']
    else:
        # Every two-digit lowercase hex label, '00' through 'ff' (256 bins).
        bins = [format(i, '02x') for i in range(256)]

    # One feature frame per bin, stacked row-wise into a single frame.
    chart_features = pd.concat(
        [charts_data_wrapper(b, d_items, df, demographic_features, i)
         for i, b in enumerate(bins)],
        sort=False,
    )
    del d_items

    # Persist the result so later runs can skip the recomputation.
    os.makedirs(result_dir, exist_ok=True)
    chart_features.to_csv(os.path.join(result_dir, 'chart_features.csv'), index=False)

else:
    # NOTE(review): hadm_id is forced to str when loading the snapshot but is
    # not cast in the recompute branch above -- confirm merge_features copes
    # with both dtypes, or cast consistently.
    chart_features = pd.read_csv(CHART_FEATURES_CACHE, dtype={'hadm_id': str})

In [12]:
# Inspect the chart features regardless of whether they were recomputed or
# loaded from disk by the previous cell.
printer('chart data features')
chart_features.shape
chart_features.columns
chart_features.sample(5)
# Used as a visual separator between the sample and the summary table.
printer('\n')
# One row per feature, ordered by descending mean.
# NOTE(review): the recorded output shows an ft_avg_hematocrit max of
# 714312.35 and a negative ft_avg_creatinine min -- likely bad chart values
# that should be cleaned or clipped upstream; verify.
chart_features.describe().T.sort_values('mean', ascending=False)

chart data features
-------------------


(56097, 25)

Index(['hadm_id', 'ft_creatinine_increase_within_48',
       'ft_creatinine_increase_from_baseline', 'ft_baseline_creat_gt_1',
       'ft_avg_creatinine', 'ft_avg_hematocrit', 'ft_above_normal_hematocrit',
       'ft_below_normal_hematocrit', 'ft_way_below_normal_hematocrit',
       'ft_elevated_bp', 'ft_abnormally_low_bp', 'ft_hbp_stg_1',
       'ft_hbp_stg_2', 'ft_hbp_crisis', 'ft_hbp_stg_2_within_6_hours',
       'ft_hbp_stg_2_within_12_hours', 'ft_hbp_stg_2_within_24_hours',
       'ft_hbp_stg_2_within_36_hours', 'ft_hbp_stg_2_within_48_hours',
       'ft_low_blood_ph', 'ft_low_blood_ph_within_6_hrs',
       'ft_low_blood_ph_within_12_hrs', 'ft_low_blood_ph_within_24_hrs',
       'ft_low_blood_ph_within_36_hrs', 'ft_low_blood_ph_within_48_hrs'],
      dtype='object')

Unnamed: 0,hadm_id,ft_creatinine_increase_within_48,ft_creatinine_increase_from_baseline,ft_baseline_creat_gt_1,ft_avg_creatinine,ft_avg_hematocrit,ft_above_normal_hematocrit,ft_below_normal_hematocrit,ft_way_below_normal_hematocrit,ft_elevated_bp,ft_abnormally_low_bp,ft_hbp_stg_1,ft_hbp_stg_2,ft_hbp_crisis,ft_hbp_stg_2_within_6_hours,ft_hbp_stg_2_within_12_hours,ft_hbp_stg_2_within_24_hours,ft_hbp_stg_2_within_36_hours,ft_hbp_stg_2_within_48_hours,ft_low_blood_ph,ft_low_blood_ph_within_6_hrs,ft_low_blood_ph_within_12_hrs,ft_low_blood_ph_within_24_hrs,ft_low_blood_ph_within_36_hrs,ft_low_blood_ph_within_48_hrs
41914,137110,0.0,0.0,0.0,0.6,36.099998,0.0,1.0,0.0,,,,,,,,,,,,,,,,
9298,173243,0.0,0.0,0.0,0.633333,38.85,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,,,,,,
31983,140711,0.0,0.0,0.0,0.5,40.05,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
22485,185917,1.0,0.0,1.0,5.075,29.275,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
895,140947,0.0,0.0,0.0,0.833333,26.35,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,




-------------------


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ft_avg_hematocrit,54372.0,48.809042,3094.626263,8.65,28.4,31.428571,36.2,714312.35
ft_avg_creatinine,48676.0,1.351749,1.322985,-0.786364,0.7,0.9,1.375,11.0
ft_abnormally_low_bp,45300.0,0.95574,0.205676,0.0,1.0,1.0,1.0,1.0
ft_below_normal_hematocrit,54372.0,0.870172,0.336117,0.0,1.0,1.0,1.0,1.0
ft_elevated_bp,45300.0,0.808698,0.393331,0.0,1.0,1.0,1.0,1.0
ft_hbp_stg_1,45300.0,0.805099,0.396129,0.0,1.0,1.0,1.0,1.0
ft_hbp_stg_2,45300.0,0.715695,0.451088,0.0,0.0,1.0,1.0,1.0
ft_hbp_stg_2_within_48_hours,45300.0,0.560861,0.496288,0.0,0.0,1.0,1.0,1.0
ft_hbp_stg_2_within_36_hours,45300.0,0.522892,0.499481,0.0,0.0,1.0,1.0,1.0
ft_low_blood_ph,39397.0,0.505521,0.499976,0.0,0.0,1.0,1.0,1.0


In [13]:
# Lab-based features: combine the lab events with the lab item dictionary,
# then derive one feature frame per analyte/condition from the combined table.
labevents = read_prod_data('labevents')
d_labitems = read_prod_data('d_labitems')
labs = make_labs_data(labevents, d_labitems)

# The raw inputs are only needed to build `labs`; release the names early.
del labevents, d_labitems

sodium_feature = create_sodium_feature(labs)
potassium_feature = create_potassium_feature(labs)
anemia_feature = create_anemia_feature(labs)

del labs

labevents (27854055, 9)
d_labitems (753, 6)


**Merge & Create Final Dataframe**
---

In [14]:
# Frames to combine, in merge order. The first entry is the label scaffold
# reduced to hadm_id + target; every other entry is a feature frame.
# NOTE(review): per the recorded shapes earlier in the notebook,
# hiv_aids_hcc_feature has 38 columns and sepsis_feature has 8 instead of 2,
# so they presumably bring extra HCC columns (possibly the target itself)
# into the merged data -- verify before modeling.
features = [
    df.drop(columns=['subject_id', 'admittime', 'dischtime']),
    demographic_features,
    chart_features,
    diabetes_hcc_feature,
    ckd4_hcc_feature,
    ckd5_hcc_feature,
    chf_hcc_feature,
    vascular_disease_hcc_feature,
    prior_admission_features,
    nephrotoxin_features,
    contrast_imaging_feature,
    mechanical_ventilation_feature,
    hiv_aids_hcc_feature,
    sepsis_feature,
    chronic_hep_feature,
    sodium_feature,
    potassium_feature,
    anemia_feature,
]

data = merge_features(features)

printer('final dataframe')
print(data.shape)

final dataframe
-------------------
(58976, 246)


In [15]:
# Show how many admissions are newborn stays, then keep only the rows whose
# newborn flag is 0.
data['ft_admit_type_newborn'].value_counts()
data = data[data['ft_admit_type_newborn'].eq(0)]
printer('\ndata after dropping newborns')
print(data.shape)

0    51113
1     7863
Name: ft_admit_type_newborn, dtype: int64


data after dropping newborns
-------------------
(51113, 246)


In [16]:
# Summary statistics of the final dataset, one row per numeric column,
# ordered by descending mean.
data.describe().T.sort_values('mean', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ft_age,48497.0,62.140312,17.133341,0.0,51.597536,64.240931,75.88501,88.999316
ft_avg_hematocrit,48171.0,48.813432,3287.784909,8.65,28.058114,30.7,34.363942,714312.35
ft_avg_icu_los_within_30,13567.0,6.566538,9.670314,0.0003,1.8515,3.2851,6.9322,173.0725
ft_avg_creatinine,47931.0,1.362775,1.329613,-0.786364,0.7,0.91,1.4,11.0
ft_below_normal_hematocrit,48171.0,0.958834,0.198676,0.0,1.0,1.0,1.0,1.0
ft_abnormally_low_bp,40306.0,0.95028,0.217368,0.0,1.0,1.0,1.0,1.0
ft_elevated_bp,40306.0,0.908748,0.287971,0.0,1.0,1.0,1.0,1.0
ft_hbp_stg_1,40306.0,0.904431,0.294003,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx,46619.0,0.888543,0.3147,0.0,1.0,1.0,1.0,1.0
ft_any_nephrotoxin_rx_within_72,46619.0,0.849654,0.357414,0.0,1.0,1.0,1.0,1.0


In [20]:
# List every summarized column ordered by descending mean. Recomputed from
# `data` instead of reading the `_16` output-cache variable, which only exists
# when the summary cell happened to run with execution count 16 in the
# current kernel and therefore breaks on Restart & Run All.
data.describe().T.sort_values('mean', ascending=False).index.tolist()

['ft_age',
 'ft_avg_hematocrit',
 'ft_avg_icu_los_within_30',
 'ft_avg_creatinine',
 'ft_below_normal_hematocrit',
 'ft_abnormally_low_bp',
 'ft_elevated_bp',
 'ft_hbp_stg_1',
 'ft_any_nephrotoxin_rx',
 'ft_any_nephrotoxin_rx_within_72',
 'ft_any_nephrotoxin_rx_within_48',
 'ft_admit_type_emergency',
 'ft_hbp_stg_2',
 'ft_prior_admission_120',
 'ft_prior_admission_90',
 'ft_any_nephrotoxin_rx_within_24',
 'ft_prior_admission_60',
 'ft_prior_admission_30',
 'ft_race_white',
 'ft_nephrotoxin_blood_pressure_rx',
 'ft_hbp_stg_2_within_48_hours',
 'ft_hbp_stg_2_within_36_hours',
 'ft_nephrotoxin_blood_pressure_rx_within_72',
 'ft_nephrotoxin_metoprolol_rx',
 'ft_gender',
 'ft_low_blood_ph',
 'ft_nephrotoxin_blood_pressure_rx_within_48',
 'ft_nephrotoxin_antibiotics_rx',
 'ft_hbp_stg_2_within_24_hours',
 'ft_nephrotoxin_diuretic_rx',
 'ft_nephrotoxin_furosemide_rx',
 'ft_nephrotoxin_metoprolol_rx_within_72',
 'ft_nephrotoxin_blood_pressure_rx_within_24',
 'ft_nephrotoxin_metoprolol_rx_within

In [17]:
# Write the merged feature table into this run's result directory.
# os.path.join keeps the path correct whether or not result_dir ends with a
# path separator.
os.makedirs(result_dir, exist_ok=True)
data.to_csv(os.path.join(result_dir, 'all_features.csv'), index=False)

In [18]:
# Confirm what was written: show the result directory and list its contents.
printer(result_dir)
print(os.listdir(result_dir))

results/2019-12-03-14-37/
-------------------
['all_features.csv']
