# DL Survival - Ventilation Outcomes
 Updated 21/11/21

In [None]:
import pandas as pd
import numpy as np
import math
import statistics
from datetime import datetime
import datetime as dt
from datetime import timedelta
import json
import miceforest as mf

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## 1. Data cleaning

- Import MIMIC III data
- Review column unique values, assign correct data types
- Impute missing values


### 1.1: Importing data

In [None]:
# Load the combined MIMIC extract; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv('mimic_combined.csv')

In [None]:
# Lift pandas' column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)
print(df.shape)
# Preview the first 10 rows (rendered as the cell output).
df.head(10)

#### 1.1.1: Column lists

In [None]:
#view and reorder columns
# `cols` is the column order as loaded from the CSV.
# `new_cols` is the grouped order (ids, demographics, procedures, vitals, labs,
# blood gases, products, ventilation, comorbidities, admission info, others)
# that the frame is re-indexed to via `df = df[new_cols]`.
# NOTE(review): 'Unnamed: 0' is presumably the index column written out when
# the CSV was saved - confirm it is still needed.
cols = list(df.columns)
new_cols = ['Unnamed: 0','hadm_id','subject_id','gender','ethnicity','marital_status','insurance','language','aortic','mit','tricuspid',
            'pulmonary','cabg','temp','bg_temp','hr','spo2','rr','sbp','dbp','meanbp','weight','height','cardiac_index','pt','ptt',
            'inr','inr_1','fibrinogen','hb','hematocrit','plts','wcc','lymphocytes','neutrophils','alp','ast','alt','ggt',
            'bilirubin_indirect','bilirubin_direct','bilirubin_total','chloride','magnesium','potassium','crp','bleed_time','albumin',
            'creatinine','free_calcium','sodium','bicarb','bun','hba1c','glucose','lactate','po2','pco2','baseexcess','ph','aado2',
            'fio2','ffp','insulin','cryo','prbc','infection','ventrate','tidalvol','vent_array','reintubation','liver_severe','liver_mild',
            'rheum','cvd','aids','ckd','copd','arrhythmia','pud','smoking','pvd','paraplegia','ccf','met_ca','t2dm','t1dm','malig','mi',
            'dementia','first_careunit','last_careunit','admission_location','admission_type','hospital_expire_flag','admittime',
            'dischtime','intime','outtime','ext_time','reint_time','los','icustay_seq','deathtime','plt','diab_un','diab_cc',
            'dtoutput','specimen','dod']

# Identifier columns. The first entry must be spelled 'Unnamed: 0' (with the
# space), exactly as the column appears in the loaded frame; 'Unnamed:0' would
# raise a KeyError as soon as this list is used to select columns.
ptinfo=['Unnamed: 0','hadm_id','subject_id']

# Column-name groups used by the cleaning cells to loop over related columns.

# Patient demographics (categorical).
demographics=['gender','ethnicity','marital_status','insurance','language']

# Procedure flags.
proceduretype=['aortic','mit','tricuspid','pulmonary','cabg']

# Vital signs and body measurements.
vitals=['temp','bg_temp','hr','spo2','rr','sbp','dbp','meanbp','weight','height','cardiac_index']

# Laboratory results.
labs=['pt','ptt','inr','inr_1','fibrinogen','hb','hematocrit','plts','wcc','lymphocytes','neutrophils','alp','ast','alt','ggt',
'bilirubin_indirect','bilirubin_direct','bilirubin_total','chloride','magnesium','potassium','crp','bleed_time',
'albumin','creatinine','free_calcium','sodium','bicarb','bun','hba1c','glucose','lactate']

# Blood-gas measurements.
bloodgases=['po2','pco2','baseexcess','ph','aado2','fio2']

# Administered products, plus the infection column.
products=['ffp','insulin','cryo','prbc','infection']

# Ventilation-related columns.
ventilation=['ventrate','tidalvol','vent_array','reintubation']

# Comorbidity flags.
comorbidities=['liver_severe','liver_mild','rheum','cvd','aids','ckd','copd','arrhythmia','pud','smoking','pvd',
'paraplegia','ccf','met_ca','t2dm','t1dm','malig','mi','dementia']

# Admission information: categorical columns.
adm_cat=['first_careunit','last_careunit','admission_location','admission_type','hospital_expire_flag']

# Admission information: timestamps, length of stay and ICU stay sequence.
adm_num=['admittime','dischtime','intime','outtime','ext_time','reint_time','los','icustay_seq','deathtime']

# Remaining columns that do not fit the groups above.
others=['plt','diab_un','diab_cc','dtoutput','specimen','dod']

# Columns treated as time series: every vitals / labs / blood-gas / product /
# ventilation column plus 'plt' and 'dtoutput', minus the five names that are
# filtered out below.
timeseries = [
    name
    for name in (*vitals, *labs, *bloodgases, *products, *ventilation, 'plt', 'dtoutput')
    if name not in ('weight', 'height', 'reintubation', 'infection', 'vent_array')
]
    
# Time-series columns whose records store the measurement under a key other
# than the default 'value'; maps column name -> that key (intended for the
# `valuename` argument of ts_parser2).
# NOTE(review): 'plts' is listed under `labs` while 'plt' is the column listed
# under `others` - confirm 'plts' (rather than 'plt') is really the column
# that uses the 'bloodproduct' key.
timeseries_valuenames = {'cardiac_index':'ci',
                         'plts':'bloodproduct',
                         'ffp':'bloodproduct',
                         'insulin':'amount',
                         'cryo':'bloodproduct',
                         'prbc':'bloodproduct',
                         'dtoutput':'output'}

In [None]:
# Re-index the frame to the grouped column order; any loaded column that is
# not named in `new_cols` is dropped by this selection.
df = df[new_cols]
df.head(10)

### 1.2: Cleaning data types

#### 1.2.0: NaN assignment

In [None]:
# Normalise every textual "missing" marker to a real NaN.
# Step 1 turns the literal string 'NaT' into a datetime NaT; step 2 maps '[]',
# the literal string 'NaN' and NaT itself to NaN.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df = df.replace('NaT', np.datetime64('NaT'))
df = df.replace(['[]', 'NaN', np.datetime64('NaT')], np.nan)

#### 1.2.1: Datetime columns
+ add vent_duration column

In [None]:
# Convert the timestamp columns from strings to datetime64.
time_cols = ['admittime','dischtime','intime','outtime','reint_time','ext_time','deathtime']

# Each column is parsed with an explicit pattern: full timestamps for the
# admission/ICU/ventilation columns, date only for `dod`.
column_formats = dict.fromkeys(time_cols, '%Y-%m-%d %H:%M:%S')
column_formats['dod'] = '%Y-%m-%d'
for col, fmt in column_formats.items():
    df[col] = pd.to_datetime(df[col], format=fmt)

In [None]:
# define function for getting ventilation duration (1st ventilation)


### NOTE: NEED TO EDIT FORMULA FOR VENT DURATION BASED ON JAHAN'S FORMULA ###


def get_vent_duration(row):
    """
    Return the duration of the first ventilation episode for one row, in hours.

    Parameters
    ----------
    row : mapping with datetime-like 'intime', 'ext_time' and 'deathtime' entries

    Returns
    -------
    Hours from 'intime' to 'ext_time'. When that difference is NaN (a missing
    timestamp), hours from 'intime' to 'deathtime' are returned instead, which
    is NaN again if that cannot be computed either.
    """
    start = row['intime']
    elapsed_s = (row['ext_time'] - start).total_seconds()
    if math.isnan(elapsed_s):
        # no usable extubation time: fall back to time of death
        elapsed_s = (row['deathtime'] - start).total_seconds()
    # seconds -> minutes -> hours
    return elapsed_s / 60 / 60

In [None]:
# create ~NEW COLUMN~ for vent_duration
# (row-wise apply: get_vent_duration is called once per row and returns hours)
df['vent_duration'] = df.apply(get_vent_duration, axis=1)

In [None]:
## CHECK FOR ROWS WITH A NEGATIVE VENT DURATION
## (the end time used - ext_time, or deathtime as fallback - lies before intime)
xtime_cols = ['ext_time','vent_duration','admittime','dischtime','intime','outtime','reint_time','deathtime','dod']
# Show only the time-related columns of the offending rows.
df.loc[df['vent_duration'] < 0][xtime_cols]

In [None]:
# Confirm the dtypes of the time-related columns after conversion.
df[xtime_cols].dtypes

#### 1.2.2: Demographics

In [None]:
# List the distinct values of each demographic column to spot labels that
# need recoding.
for x in demographics:
    print(x,': ',df[x].unique())

In [None]:
#ethnicity
# Recode ethnicity: "unknown"-style labels become missing, the remaining
# labels listed here are shortened to lower-case codes.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df.replace(
    {'ethnicity': {
        'unknown': np.nan,
        'UNKNOWN': np.nan,
        'UNABLE TO OBTAIN': np.nan,
        'OTHER': 'other',
        'WHITE': 'white',
        'BLACK/AFRICAN AMERICAN': 'black',
        'ASIAN': 'asian',
        'HISPANIC/LATINO': 'hispanic',
        'AMERICAN INDIAN/ALASKA NATIVE': 'native',
    }},
    inplace=True,
)
print(df['ethnicity'].unique())

In [None]:
#marital_status
# Treat the placeholder label as missing.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df.replace(
    {'marital_status': {'UNKNOWN (DEFAULT)': np.nan}},
    inplace=True,
)
print(df['marital_status'].unique())

In [None]:
#language
# Fold 'ENGLISH' into the 'ENGL' code and treat '?' as missing.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df.replace(
    {'language': {'ENGLISH': 'ENGL', '?': np.nan}},
    inplace=True,
)
# Verify the column that was just recoded (this previously printed
# 'marital_status' by mistake).
print(df['language'].unique())

#### 1.2.3: ✔Procedure type

In [None]:
# List the distinct values of each procedure-type column.
for x in proceduretype:
    print(x,': ',df[x].unique())

#### 1.2.4: **Vitals / Blood Gases / Products + infection / Ventilation


In [None]:
# wait for Jahan/others
# ventrate seems to be empty

#### 1.2.5: ✔Comorbidities

In [None]:
# List the distinct values of each comorbidity column.
for x in comorbidities:
    print(x,': ',df[x].unique())

#### 1.2.6: Admissions (categorical)

In [None]:
# List the distinct values of each categorical admission column to spot
# labels that need recoding.
for x in adm_cat:
    print(x,': ',df[x].unique())

In [None]:
#first_careunit
# Shorten the long care-unit names in `first_careunit` to their abbreviations;
# values not listed here are left as they are.
df.replace(
    to_replace={
        'first_careunit': {
            'Cardiac Vascular Intensive Care Unit (CVICU)': 'CVICU',
            'Coronary Care Unit (CCU)': 'CCU',
            'Medical Intensive Care Unit (MICU)': 'MICU',
            'Surgical Intensive Care Unit (SICU)': 'SICU',
            'Neuro Intermediate': 'Neuro Inter',
            'Medical/Surgical Intensive Care Unit (MICU/SICU)': 'MICU/SICU',
            'Trauma SICU (TSICU)': 'TSICU',
            'Neuro Surgical Intensive Care Unit (Neuro SICU)': 'Neuro SICU',
        },
    },
    inplace=True,
)
print(df['first_careunit'].unique())

In [None]:
#last_careunit
# Shorten the long care-unit names in `last_careunit` to their abbreviations;
# values not listed here are left as they are.
df.replace(
    to_replace={
        'last_careunit': {
            'Cardiac Vascular Intensive Care Unit (CVICU)': 'CVICU',
            'Coronary Care Unit (CCU)': 'CCU',
            'Medical Intensive Care Unit (MICU)': 'MICU',
            'Surgical Intensive Care Unit (SICU)': 'SICU',
            'Neuro Intermediate': 'Neuro Inter',
            'Medical/Surgical Intensive Care Unit (MICU/SICU)': 'MICU/SICU',
            'Trauma SICU (TSICU)': 'TSICU',
            'Neuro Surgical Intensive Care Unit (Neuro SICU)': 'Neuro SICU',
        },
    },
    inplace=True,
)
print(df['last_careunit'].unique())

In [None]:
#admission_location
# Expand the truncated admission-location labels, merge equivalent ones and
# treat 'INFORMATION NOT AVAILABLE' as missing.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df.replace(
    {'admission_location': {
        'TRANSFER FROM HOSP/EXTRAM': 'TRANSFER FROM HOSPITAL',
        'PHYS REFERRAL/NORMAL DELI': 'PHYSICIAN REFERRAL',
        'TRANSFER FROM SKILLED NUR': 'TRANSFER FROM SKILLED NURSING FACILITY',
        'INFORMATION NOT AVAILABLE': np.nan,
        'CLINIC REFERRAL': 'CLINIC REFERRAL/PREMATURE',
        'EMERGENCY ROOM ADMIT': 'EMERGENCY ROOM',
    }},
    inplace=True,
)
print(df['admission_location'].unique())

#### 1.2.7: Others

In [None]:
# for x in others:
#     print(x,': ',df[x].unique())

### Parsing time series data

In [None]:
# df['vent_array'][0]

In [None]:
# def ventarray_parser(value):
#     int_time1=np.NaN
#     ext_time1=np.NaN
#     int_time2=np.NaN
#     ext_time2=np.NaN
#     if value == np.NaN:
#         return np
#     else:
#         a = value
#         for i in ['\n ',
#                   '[',']',
#                   "{'starttime': datetime.datetime",
#                   " 'endtime': datetime.datetime",
#                   " 'duration_hours': "]:
#             a = a.replace(i,'')
#         split = a.split('}')
#         del split[-1]
#         int_time1=np.NaN
#         ext_time1=np.NaN
#         int_time2=np.NaN
#         ext_time2=np.NaN
#         if len(split) == 1:
#             pass
#         elif len(split) == 2:
#             pass
#         else:
#             raise ValueError("length of vent_array is wonky")

# test_x = df['vent_array'][13]
# ventarray_parser(test_x)

In [None]:
def infection_parser(value, timelimit):
    """
    Work-in-progress parser for a single `infection` cell.

    Parameters
    ----------
    value : single string of infection data in MIMIC format, or a missing value
    timelimit : currently unused

    Returns
    -------
    NaN when `value` is missing, otherwise None (the parsing is unfinished:
    the string is only cleaned and split into per-record fragments).
    """
    # NaN never compares equal to anything (not even itself), so a missing
    # cell has to be detected with pd.isna rather than `== np.nan`.
    if pd.isna(value):
        return np.nan
    text = value
    for junk in ('\n ', '[', ']', "{'charttime': datetime.datetime"):
        text = text.replace(junk, '')
    split = text.split('}')
    # TODO: extract the infection records from `split` and apply `timelimit`.
    return None

In [None]:
def ts_parser(value, timelimit):
    """
    Takes single string of timeseries data in MIMIC format and returns the mean, max, min values

    The readings kept are those charted from the earliest entry up to
    `timelimit` hours later, both ends included (the same window that
    ts_parser2 uses for its `timeDelta` argument).

    Parameters
    ----------
    value : single string of timeseries data in MIMIC format, or a missing value
    timelimit : time (in hours) from the first data entry to include data up to

    Returns
    -------
    avg : mean of all values within specified time period
    max_: maximum of all values within specified time period
    min_: minimum of all values within specified time period

    All three are NaN when `value` is missing or no reading falls in the window.
    """
    # NaN never compares equal to anything (not even itself), so a missing
    # cell has to be detected with pd.isna rather than `== np.nan`.
    if pd.isna(value):
        return np.nan, np.nan, np.nan

    # Strip the list/dict/datetime scaffolding so that every record reads
    # "(Y, M, D, h, m), 'value': <number>" and records are separated by '}'.
    text = value
    for junk in ('\n ', '[', ']', "{'charttime': datetime.datetime"):
        text = text.replace(junk, '')
    records = [piece for piece in text.split('}') if piece.strip()]

    times = []
    values = []
    for record in records:
        parts = record.split(", 'value'")
        # Build the timestamp from its integer fields; unlike a fixed strptime
        # pattern this also accepts entries that carry seconds/microseconds.
        fields = [int(field) for field in parts[0].strip(' ,()').split(',')]
        times.append(datetime(*fields))
        values.append(float(parts[1].replace(': ', '')))

    if not times:
        return np.nan, np.nan, np.nan

    starttime = min(times)
    endtime = starttime + timedelta(hours=timelimit)
    # Inclusive bounds: the first reading itself belongs to the window.
    incl_values = [v for t, v in zip(times, values) if starttime <= t <= endtime]
    if not incl_values:
        return np.nan, np.nan, np.nan

    avg = statistics.mean(incl_values)
    max_ = max(incl_values)
    min_ = min(incl_values)
    return avg, max_, min_

def ts_parser2(value, timeDelta=None, timeLimits=None, valuename='value'):
    """
    Parse one MIMIC-format timeseries string and return (mean, max, min).

    Parameters
    ----------
    value : single string of timeseries data in MIMIC format, or a missing value
    timeDelta : window length in hours, counted from the earliest entry
        (inclusive); 0 keeps only readings charted at the earliest time
    timeLimits : (startTime, endTime) datetimes, both inclusive
    valuename : key under which each record stores its reading

    If both timeDelta and timeLimits are provided, timeDelta overrules.
    If both are None, then all timepoints are accepted.

    Returns
    -------
    mean, maximum and minimum of the selected readings; all three are NaN
    when `value` is missing or no reading falls inside the window.
    """
    if pd.isna(value):
        return np.nan, np.nan, np.nan

    # Rewrite the Python-repr text into valid JSON: double quotes, commas
    # between records, and each datetime call wrapped in a string.
    a = value.replace("'", '"')
    a = a.replace('\n ...\n',',').replace('\n', ',').replace('...', '')
    a = a.replace('datetime.', '"dt.')
    a = a.replace(f'), "{valuename}"', f')", "{valuename}"')
    a = a.replace('"unit": None', '"unit": "None"')
    a = a.replace('starttime', 'charttime')
    a = json.loads(a)
    # NOTE(security): eval() executes the 'dt.datetime(...)' text taken from
    # the data file. Acceptable only while that file is trusted; replace with
    # an explicit field parser before running on externally supplied data.
    b = [(eval(i['charttime']), i[valuename]) for i in a]

    # `is not None` so that an explicit timeDelta of 0 is honoured instead of
    # being treated as "no limit".
    if timeDelta is not None:
        startTime = min(b, key=lambda x:x[0])[0]
        inc_b = [i[1] for i in b if i[0] <= startTime + dt.timedelta(hours=timeDelta)]
    elif timeLimits:
        inc_b = [i[1] for i in b if i[0] >= timeLimits[0] and i[0] <= timeLimits[1]]
    else:
        inc_b = [i[1] for i in b]

    # An empty window would otherwise raise ZeroDivisionError / ValueError.
    if not inc_b:
        return np.nan, np.nan, np.nan
    return sum(inc_b) / len(inc_b), max(inc_b), min(inc_b)

# Smoke test: run both parsers on the same cells so their outputs can be
# compared side by side.
# First row of the first time-series column, 12-hour window.
test_x = df[timeseries].iloc[0,0]
print(ts_parser(test_x,12))
print(ts_parser2(test_x, timeDelta=12))
print()
# Row 9 of `bg_temp`, 36-hour window; the raw string is printed for reference.
test_y = df['bg_temp'][9]
print(test_y)
print('Parser1: ', ts_parser(test_y, 36))
print('Parser2: ', ts_parser2(test_y, timeDelta=36))

### 1.3: Handling missing data

#### 1.3.0 Assessing for missing data

In [None]:
# formula for checking % missing values
def missing_values_table(df):
    """
    Summarise missingness per column.

    Parameters
    ----------
    df : DataFrame to inspect

    Returns
    -------
    DataFrame indexed by the columns of `df` with two columns:
    'Missing Values' (count of nulls) and '% Missing Values'
    (that count as a percentage of the number of rows).
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)
    # `keys` labels the two concatenated series directly
    return pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', '% Missing Values'],
    )

# Per-column missing counts and percentages for the whole frame.
missing_data = missing_values_table(df)

In [None]:
#set limit and get list of variables missing above limit in `missing_cols`
# `missing_limit` is a percentage: columns with more than this share of
# missing values are collected.
missing_limit = 20
missing_cols = missing_data.loc[missing_data['% Missing Values']>missing_limit].index.tolist()
print(missing_cols)
# Display the summary rows for those columns as the cell output.
missing_data.loc[missing_data['% Missing Values']>missing_limit]

In [None]:
# Missingness of the timestamp columns only.
missing_data.loc[time_cols,:]

In [None]:
# option 1: drop every column listed in `missing_cols`
# With inplace=False the result is discarded and `df` is left unchanged, so
# this is a dry run; set inplace to True to actually drop the columns.

df.drop(columns=missing_cols, inplace=False)
print(list(df.columns))

# reset index
#df.reset_index(drop=True, inplace=True)

In [None]:
# option 2: impute data based on median


In [None]:
# option 3: multiple imputation

# Columns that have at least one missing value ...
x = missing_data.loc[missing_data['% Missing Values']> 0]
# ... shown without the timestamp columns in `time_cols`.
x.loc[[i for i in x.index if i not in time_cols],:]

#### 1.3.1 Creating summary fields for time-series data

In [None]:
# checking that ts_parser2() works for the timeseries columns

# for j in timeseries:
#     for i in range(len(df[j])):
#         try:
#             if j in timeseries_valuenames:
#                 ts_parser2(df[j][i], timeDelta=36, valuename=timeseries_valuenames[j])
#             else:
#                 ts_parser2(df[j][i], timeDelta=36)
#         except:
#             print(j, i)
#             break
#     print(j, 'Fine')

#### 1.3.2 Beginning imputation

In [None]:
# Columns that have at least one missing value.
x = missing_data.loc[missing_data['% Missing Values']> 0]

# Only these categorical columns are imputed for now. (An earlier selection of
# every incomplete non-timeseries column was immediately overwritten by this
# one, so that dead assignment has been removed.)
impute_cols = ['ethnicity', 'marital_status', 'language', 'admission_location']

# Take an explicit copy so the dtype changes below modify an independent
# frame instead of a slice of `df` (avoids SettingWithCopyWarning).
dfForImpute = df[impute_cols].copy()
for col in impute_cols:
    dfForImpute[col] = dfForImpute[col].astype('category')

# before imputation
dfForImpute

In [None]:
# Build a miceforest imputation kernel on the categorical frame; the fixed
# random_state is passed so repeated runs use the same seed.
# NOTE(review): the keyword names `datasets` / `save_all_iterations` depend on
# the installed miceforest version - confirm against the pinned release.
kds = mf.ImputationKernel(
  dfForImpute,
  datasets=1,
  save_all_iterations=True,
  random_state=1991
)

# Run the MICE algorithm for 2 iterations
kds.mice(2)

print(kds)

# Pull the completed copy of dataset 0 and check that no nulls remain.
dfImputed = kds.complete_data(dataset=0, inplace=False)
print(dfImputed.isnull().sum(0))

# after imputation
dfImputed