In [25]:
import pandas as pd
import numpy as np
import math
import statistics
from sksurv.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_classif

In [2]:
#load data
pd.set_option('display.max_columns', None)
df = pd.read_csv('imputed_revent.csv')
df = df.drop(axis=1, columns=['Unnamed: 0','last_careunit'])

## 1: Post-pre-processing

#### 1.1: Variable grouping

In [10]:
for col in ['ext_time1','int_time2','dischtime']:
    df[col] = pd.to_datetime(df[col])

# get time to reintubation
def get_reventTime(ext_time1=None, int_time2=None, dischtime=None):
    time_to_reint = 0
    if int_time2 == np.NaN or pd.isna(int_time2):
        time_to_reint = np.NaN
        return time_to_reint
    else:
        time_to_reint = int_time2 - ext_time1
        return time_to_reint.total_seconds()/3600

time_to_revent = []
for i in list(range(len(df))):
    y = get_reventTime(ext_time1=df['ext_time1'][i],int_time2=df['int_time2'][i],dischtime=df['dischtime'][i])
    time_to_revent.append(y)

df['time_to_revent'] = time_to_revent


In [11]:
# variables for analysis
categorical = ['ethnicity', 
              'marital_status',
              'language',
              'admission_location',
              'gender',
              'insurance',
              'first_careunit',
              'admission_type']
proceduretype=['aortic','mit','tricuspid','pulmonary','cabg']
ptParams = ['weight', 'height', 'bmi','duration1']
boolFields = ['reintubation', 'liver_severe', 'liver_mild', 'rheum', 'cvd', 'aids', 'ckd', 'copd', 'arrhythmia', 'pud', 'smoking', 'pvd', 'paraplegia', 
              'ccf', 'met_ca', 't2dm', 't1dm', 'malig', 'mi', 'dementia', 'hospital_expire_flag', 'diab_un', 'diab_cc','infection_vent']
ptinfo = ['hadm_id', 'subject_id']
ptTimes = ['int_time1','ext_time1','int_time2','ext_time2','admittime', 'dischtime', 'deathtime','intime', 'outtime', 'ext_time', 'icustay_seq']
tsColumns_last = [i for i in df.columns if '_last' in i]
tsColumns_first = [i for i in df.columns if '_first' in i]
tsColumns_notFirst = [i for i in df.columns if '_max' in i or '_min' in i or '_mean' in i or '_last' in i]

inputs = [*categorical , *proceduretype , *tsColumns_last, *ptParams , *[i for i in boolFields if i != 'reintubation']]
inputs_all = [*categorical , *proceduretype , *tsColumns_first , *tsColumns_notFirst, *ptParams , *[i for i in boolFields if i != 'reintubation']]
outcomes = ['reintubation','hospital_expire_flag','los','duration2','icu_stay_duration','time_to_revent']

print([i for i in df.columns if i not in categorical + proceduretype + tsColumns_first + tsColumns_notFirst + ptParams + boolFields + ptinfo + ptTimes + outcomes])
print([i for i in df.columns if i not in inputs + outcomes + ptTimes + tsColumns_first])

['censor']
['temp_mean', 'temp_max', 'temp_min', 'bg_temp_mean', 'bg_temp_max', 'bg_temp_min', 'hr_mean', 'hr_max', 'hr_min', 'spo2_mean', 'spo2_max', 'spo2_min', 'rr_mean', 'rr_max', 'rr_min', 'sbp_mean', 'sbp_max', 'sbp_min', 'dbp_mean', 'dbp_max', 'dbp_min', 'meanbp_mean', 'meanbp_max', 'meanbp_min', 'cardiac_index_mean', 'cardiac_index_max', 'cardiac_index_min', 'pt_mean', 'pt_max', 'pt_min', 'ptt_mean', 'ptt_max', 'ptt_min', 'inr_mean', 'inr_max', 'inr_min', 'inr_1_mean', 'inr_1_max', 'inr_1_min', 'fibrinogen_mean', 'fibrinogen_max', 'fibrinogen_min', 'hb_mean', 'hb_max', 'hb_min', 'hematocrit_mean', 'hematocrit_max', 'hematocrit_min', 'wcc_mean', 'wcc_max', 'wcc_min', 'lymphocytes_mean', 'lymphocytes_max', 'lymphocytes_min', 'neutrophils_mean', 'neutrophils_max', 'neutrophils_min', 'alp_mean', 'alp_max', 'alp_min', 'ast_mean', 'ast_max', 'ast_min', 'alt_mean', 'alt_max', 'alt_min', 'bilirubin_total_mean', 'bilirubin_total_max', 'bilirubin_total_min', 'chloride_mean', 'chloride_ma

In [12]:
# change categorical to category type
for col in categorical:
    df[col] = df[col].astype('category')

In [13]:
# change dt to dt type
for col in [i for i in ptTimes if i != 'icustay_seq']:
    df[col] = pd.to_datetime(df[col])

In [14]:
# create censor column for death before extubation
censor = []
for i in range(len(df)):
    int_time2=df['int_time2'][i]
    if int_time2 == np.NaN or pd.isna(int_time2):
        #censor.append(False)
        censor.append(True)
    elif pd.isnull(df['deathtime'][i]) or df['deathtime'][i] >= df['ext_time1'][i]:
        censor.append(True)
    else:
        censor.append(False)
censored = []
for i in range(len(censor)):
    if censor[i] == False:
        censored.append(i)
print('Number censored: '+str(len(censored)))
print(censored)

df['censor'] = censor

Number censored: 0
[]


#### 1.2: !!X, Xt, y!!

In [23]:
# get X, Xt and y
outcome = 'duration1'

X = df[inputs]
#X = df[inputs_all]
Xt = OneHotEncoder().fit_transform(X)

y = np.zeros(len(df), dtype={'names':('Censor', 'Survival'),
                          'formats':('?', '<f8')})

y['Censor'] = df['censor']
y['Survival'] = df['time_to_revent']
y = df['reintubation']

## 2: Univariate analysis

In [28]:
# get list of top variables by mutual
how_many_select = 40

mutual_info_df = pd.DataFrame(mutual_info_classif(Xt, y, random_state=42), index=[*list(Xt.columns)])
mutual_info_df = mutual_info_df.sort_values(by=[0])
sel_var = list(mutual_info_df.index)[-how_many_select:]

['admission_location=PHYSICIAN REFERRAL',
 'arrhythmia',
 'hospital_expire_flag',
 'bicarb_last',
 'baseexcess_last',
 'creatinine_last',
 'chloride_last',
 'ccf',
 'bun_last',
 'duration1']