In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from lofo import LOFOImportance, Dataset, plot_importance
%matplotlib inline

In [2]:
# Read the train and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train6.csv', './test6.csv'))

In [3]:
# Merge several hospital_admit_source labels into fewer categories.
# The same mapping is applied to both the train and the test frame.
admit_source_map = {
    'Other ICU': 'ICU',
    'ICU to SDU': 'SDU',
    'Step-Down Unit (SDU)': 'SDU',
    'Other Hospital': 'Other',
    'Observation': 'Recovery Room',
    'Acute Care/Floor': 'Acute Care',
}
for frame in (train, test):
    frame['hospital_admit_source'] = frame['hospital_admit_source'].replace(admit_source_map)

In [4]:
# Fold the three cardiac ICU labels into one 'Grpd_CICU' category in both frames.
cardiac_icu_labels = ('CCU-CTICU', 'CTICU', 'Cardiac ICU')
icu_type_map = {label: 'Grpd_CICU' for label in cardiac_icu_labels}
for frame in (train, test):
    frame['icu_type'] = frame['icu_type'].replace(icu_type_map)

In [5]:
def _holds_strings(series):
    """Return True when the first non-missing value of `series` is a str.

    Looking at the first non-missing value (rather than row 0) keeps a string
    column from being overlooked just because its first row is NaN.
    """
    observed = series.dropna()
    return (not observed.empty) and isinstance(observed.iloc[0], str)


# Categorical columns: the string-valued columns found in `train`, followed by
# the explicitly listed names. A name that is both detected and listed would
# otherwise appear twice, so dict.fromkeys drops the repeats while keeping the
# first-seen order.
cat_cols = list(dict.fromkeys(
    [name for name in train.columns if _holds_strings(train[name])]
    + ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type',
       'icu_type', 'apache_3j_bodysystem', 'Glasglow_comma_score_t', 'weightclass']
))

In [6]:
# Keep the target values as an array, then remove the target column from `train`.
y = train['hospital_death'].values
train = train.drop(columns=['hospital_death'])

In [9]:
# Label-encode the string-valued categorical columns so that train and test
# share one integer code per category. Missing values become the literal
# category 'NaN' before encoding.
le = preprocessing.LabelEncoder()

for col in cat_cols:
    # Decide by the first non-missing value: a NaN in row 0 must not hide a
    # string column, and a column that has already been encoded (cat_cols can
    # name a column more than once) is skipped because its values are no
    # longer str.
    observed = train[col].dropna()
    if observed.empty or not isinstance(observed.iloc[0], str):
        continue
    print(col)

    # Build filled copies instead of calling fillna(inplace=True) on
    # train[col]: under pandas Copy-on-Write that chained in-place call only
    # changes a temporary object and leaves the frame untouched.
    train_labels = train[col].fillna('NaN')
    test_labels = test[col].fillna('NaN')

    # Fit on the labels of both frames so that a category present only in
    # test does not make transform() fail on an unseen label. When test adds
    # no new category the resulting codes equal those of a train-only fit.
    le.fit(train_labels.tolist() + test_labels.tolist())
    train[col] = le.transform(train_labels.values)
    test[col] = le.transform(test_labels.values)

ethnicity
gender
hospital_admit_source
icu_admit_source
icu_stay_type
icu_type
apache_3j_bodysystem
Glasglow_comma_score_t
weightclass


In [10]:
# Columns removed from both the train and the test frame.
DROP_COLS = [
    'readmission_status',
    'encounter_id',
    'patient_id',
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
    'd1_calcium_min',
    'd1_glucose_min',
]

train = train.drop(columns=DROP_COLS)
test = test.drop(columns=DROP_COLS)

In [13]:
# Put the target values held in `y` back on the frame as 'hospital_death'.
train = train.assign(hospital_death=y)

In [None]:
# Work on a 25% random sample of the rows; the fixed seed makes the draw repeatable.
sample_df = train.sample(frac=0.25, random_state=0)

# Every column except the target is offered as a feature.
feature_names = [name for name in train.columns if name != 'hospital_death']
dataset = Dataset(df=sample_df, target="hospital_death", features=feature_names)

# Validation scheme: shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# LOFO importance scored with ROC AUC. No model is passed, so the library's
# default model is used.
lofo_imp = LOFOImportance(dataset, cv=cv, scoring="roc_auc")

# Importance table in pandas format (mean and standard deviation per feature).
importance_df = lofo_imp.get_importance()

# Plot the importance means and standard deviations.
plot_importance(importance_df, figsize=(12, 20))

In [160]:
# Display the importance table returned by lofo_imp.get_importance().
importance_df

Unnamed: 0,feature,importance_mean,importance_std
26,gcs_verbal_apache,3.436791e-03,2.002346e-03
181,wbc_apache_flag,3.265577e-03,1.772769e-03
157,d1_arterial_po2_min,2.661732e-03,2.493615e-03
1,age,2.425830e-03,1.940636e-03
154,d1_arterial_ph_max,2.417295e-03,1.930292e-03
89,h1_sysbp_min,2.416166e-03,1.546878e-03
34,pao2_apache,2.391494e-03,1.237917e-03
38,temp_apache,2.195723e-03,1.175118e-03
214,apache_3j_score,2.126502e-03,2.316896e-03
94,d1_albumin_max,2.091905e-03,3.453547e-03
