In [None]:
# Project helper module. The star import is presumably where the unqualified
# names used throughout this notebook (read_prod_data, printer, merge_features,
# pd, result_dir, ...) come from -- TODO confirm, and consider importing them
# explicitly so their origin is visible.
from feature_engineering_utilities import *
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; the cells below rely on this to show several results each.
InteractiveShell.ast_node_interactivity = "all"

**Observations with Labels**
---
Using the ICD-HCC crosswalk, the ICD diagnosis data for each admission, and the admissions table, create a labeled dataset that contains the target variable. This dataframe serves as the scaffolding: each set of features generated below is merged onto it. The goal of this IPython notebook is to parse through all the different tables and create a final feature dataset that we can use for our modeling exercise.

In [None]:
# Raw inputs: the crosswalk, the per-admission ICD diagnoses, and admissions.
icdxw = read_crosswalk()
diagnoses_icd = read_prod_data('diagnoses_icd')
admissions = read_prod_data('admissions')

# df is the labeled scaffold; hcc_data is reused by the HCC feature cell below.
df = add_aki_hcc_label(diagnoses_icd, icdxw, admissions)
hcc_data = create_hcc_labeled_dataset(diagnoses_icd, icdxw)

printer('labeled dataset')
df.shape
df.sample(5)
# Relative frequency of each value in the hcc_cd_135 column.
df['hcc_cd_135'].value_counts(normalize=True)

# Neither table is referenced again in this notebook.
del diagnoses_icd, icdxw

Notice that 20% of the data are positives. We might want to consider undersampling the negatives or oversampling the positives (when we model the data) so that we can have an even distribution of this event.

**Demographic Features**
---
Read in the patients table and create demographic features. Some of the main risk factors for "Acute Kidney Injury" (AKI) are **age**, **gender** and **ethnicity**. We also pull out the **admission type** of the stay to see if knowing that the hospital admission was due to an emergency condition or an elective procedure could be useful to us.

In [None]:
# Load the patients table and build the demographic features from it together
# with the admissions table loaded earlier.
patients = read_prod_data('patients')
demographic_features = create_demographics_features(admissions, patients)
# The raw patients table is not referenced again in this notebook.
del patients
printer('demographic features')
# Quick look at the result: dimensions, column names, and five sampled rows.
demographic_features.shape
demographic_features.columns
demographic_features.sample(5)

Note that some patients don't have an age: they were old enough that their age might have made them too easy to identify.
Therefore, their dates of birth were nulled out to protect their identity.

**HCCs for Comorbidities**
---
Create features for HCCs that are comorbidities.

In [None]:
# (label, rename_as) argument pairs, one per comorbidity HCC feature, in the
# order the features are built.
hcc_feature_specs = [
    ('_19', 'hcc_cd_19_dbtes_wo_comp'),
    ('_136', 'hcc_cd_136_ckd_stg_5'),
    ('_137', 'hcc_cd_137_ckd_stg_4'),
    ('_85', 'hcc_cd_85_chf'),
    ('_108', 'hcc_cd_108_vascular'),
]
(
    diabetes_hcc_feature,
    ckd5_hcc_feature,
    ckd4_hcc_feature,
    chf_hcc_feature,
    vascular_disease_hcc_feature,
) = [
    create_hcc_feature(hcc_data, label=hcc_label, rename_as=new_name)
    for hcc_label, new_name in hcc_feature_specs
]

# hcc_data is not referenced again in this notebook.
del hcc_data
printer('hcc features')
# Each shape is a separate top-level expression so that every one is displayed.
diabetes_hcc_feature.shape
ckd5_hcc_feature.shape
ckd4_hcc_feature.shape
chf_hcc_feature.shape
vascular_disease_hcc_feature.shape
vascular_disease_hcc_feature.sample(5)

**Prior Admissions / ICU Stays**
---

In [None]:
# admissions, icu stays
# Build the prior-admission features from the admissions and ICU-stay tables;
# what exactly is computed is defined in create_prior_admissions (not shown here).
icustays = read_prod_data('icustays')
prior_admission_features = create_prior_admissions(admissions, icustays)
# The raw icustays table is not referenced again in this notebook.
del icustays

**Prescriptions that are nephrotoxins**
---

In [None]:
# prescription
# Build the nephrotoxin features from the prescriptions and admissions tables;
# the feature definitions live in add_nephrotoxin_features (not shown here).
prescriptions = read_prod_data('prescriptions')
nephrotoxin_features = add_nephrotoxin_features(prescriptions, admissions)
# The raw prescriptions table is not referenced again in this notebook.
del prescriptions

In [None]:
# Summary statistics with one row per described column, showing the five
# columns with the highest mean.
nephrotoxin_summary = nephrotoxin_features.describe().transpose()
nephrotoxin_summary.sort_values(by='mean', ascending=False).head(5)

**Contrast Imaging Procedures**
---

In [None]:
# Load the CPT events table and build the contrast-imaging feature from it.
cptevents = read_prod_data('cptevents')
contrast_imaging_feature = create_contrast_imaging_feature(cptevents)
# cptevents is not deleted here (the del is disabled) because the next two
# cells still read the raw table.
# del cptevents

In [None]:
# Preview the first rows of the raw CPT events table.
cptevents.head()

In [None]:
# Rows of the CPT events table whose description is not null.
has_description = cptevents['description'].notnull()
cptevents[has_description]

**Charts Data**
---
View the feature space that's created for the chart data. Each feature space looks at another data point in the charts table.

In [None]:
# one_bin = False

In [None]:
# NOTE: this whole cell is disabled (commented out). It builds `chart_features`
# by calling charts_data_wrapper once per bin and concatenating the results.
# Further down, `chart_features` is instead loaded from chart_features.csv in
# result_dir, so that file must already exist for the notebook to run.

# d_items = read_prod_data('d_items')

# if not one_bin:
#     bins = [hex(i)[2] + c for i in range(0, 16) for c in [hex(d)[2] for d in range(0, 16)]]   
# else:
#     bins = ['00', 'ff', '22', '3e']

# # chart features
# chart_features = [charts_data_wrapper(b, 
#                                       d_items, 
#                                       df, 
#                                       demographic_features,
#                                      i) for i,b in enumerate(bins)]
# chart_features = pd.concat(chart_features, sort=False)

# del d_items
# printer('charts features')
# chart_features.shape
# chart_features.columns
# chart_features.sample(5)

In [None]:
# chart_features.to_csv(result_dir + 'chart_features.csv', index=False)

In [None]:
# chart_features is normally loaded from a CSV cached in result_dir. If that
# file is missing (e.g. a fresh checkout), rebuild the features once and write
# the cache, so the notebook still runs top to bottom instead of failing here.
chart_features_path = result_dir + 'chart_features.csv'
try:
    chart_features = pd.read_csv(chart_features_path, dtype={'hadm_id': str})
except FileNotFoundError:
    print('chart_features.csv not found in result_dir; rebuilding chart features')
    d_items = read_prod_data('d_items')
    # Two-character lowercase hex bins: '00', '01', ..., 'ff'.
    bins = [format(i, '02x') for i in range(256)]
    # NOTE(review): this mirrors the disabled generation code earlier in the
    # notebook; confirm charts_data_wrapper still accepts these arguments.
    chart_features = pd.concat(
        [
            charts_data_wrapper(b, d_items, df, demographic_features, i)
            for i, b in enumerate(bins)
        ],
        sort=False,
    )
    del d_items
    chart_features.to_csv(chart_features_path, index=False)
    # Reload from disk so the frame is parsed exactly like the cached path
    # (in particular hadm_id is read as str).
    chart_features = pd.read_csv(chart_features_path, dtype={'hadm_id': str})

In [None]:
# Summary statistics with one row per described column, ordered by
# descending mean.
chart_summary = chart_features.describe().transpose()
chart_summary.sort_values(by='mean', ascending=False)

In [None]:
# Directory that the cached chart features are read from and that the final
# feature CSV is written to.
result_dir


**Merge & Create Final Dataframe**
---

In [None]:
# The labeled dataframe goes first, without the subject_id, admittime and
# dischtime columns.
label_frame = df.drop(labels=['subject_id', 'admittime', 'dischtime'], axis='columns')

# Comorbidity HCC features, in the order they are merged.
hcc_frames = [
    diabetes_hcc_feature,
    ckd4_hcc_feature,
    ckd5_hcc_feature,
    chf_hcc_feature,
    vascular_disease_hcc_feature,
]

# Full list handed to merge_features; the element order is kept as-is.
features = (
    [label_frame, demographic_features, chart_features]
    + hcc_frames
    + [prior_admission_features, nephrotoxin_features, contrast_imaging_feature]
)

data = merge_features(features)
printer('final dataframe')
print(data.shape)

In [None]:
# Summary statistics of the merged feature table.
data.describe()

In [None]:
# Write the merged feature table, without the index, to result_dir.
all_features_path = result_dir + 'all_features.csv'
data.to_csv(all_features_path, index=False)