# Development of machine learning models to process Electronic Health Records – Explainable Models

### Imputation Notebook
Lok Hang Toby Lee (2431180L)

# Imputation

----------------------------------------------------------

In [3]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# SET YOUR PATH FOR RESOURCES FILE HERE
resources_path = "C:/Users/USER/OneDrive/GU/Year 4/FYP/MIMIC-III-ML/data/resources"
data_path = "C:/Users/USER/OneDrive/GU/Year 4/FYP/MIMIC-III-ML/data/raw"

### Extract the variables and count the missing data

In [4]:
# Read in the data and only select the continuous variables:
config = json.load(open(resources_path + '/discretizer_config.json', 'r'))
is_categorical = config['is_categorical_channel']
categorical_var = [key for key, value in config['is_categorical_channel'].items() if value][1:]
patient_identifiers = ['icustay_id', 'subject_id', 'hadm_id', 'hours_in']
data = pd.read_csv(data_path+'/mimic_timeseries_data_not_imputed.csv')

# Filter out subjects with length of stay < 48 hours for the in-hospital mortality task:
y = pd.read_hdf(data_path+'/vitals_hourly_data_preprocessed.h5', 'Y')['los']
indices_to_remove = []
for i, row in y.iteritems():
    if row < 48:
        indices_to_remove.append(i)
        
data = data.reset_index().set_index('icustay_id').drop(indices_to_remove, axis = 0)
y = y.drop(indices_to_remove, axis = 0)

# Extract only the first 48 hours:
data = data[data['hours_in'] < 48].reset_index().set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in']).drop('index', axis = 1)
data = data.drop('Unnamed: 0', axis = 1)


In [5]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
200003,27513,163557,0,49.000000,,4.0,6.0,15.0,5.0,110.0,119.0,,58.000000,97.0,35.000000,91.000000,,77.5,
200003,27513,163557,1,52.000000,,,,,,,118.0,,59.000000,96.0,32.000000,88.333333,38.999999,77.5,
200003,27513,163557,2,52.333333,,,,,,,116.0,,59.666667,95.0,30.333333,85.333333,38.277790,77.5,
200003,27513,163557,3,60.500000,,,,,,,112.0,,65.500000,93.5,32.500000,86.500000,37.777790,77.5,
200003,27513,163557,4,61.000000,,,,,,,108.0,,67.000000,91.5,37.000000,89.500000,36.833318,77.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,,,,,,,74.0,,93.000000,,19.000000,145.000000,,,
299995,28775,134959,44,,,,,,,,72.0,,,,20.000000,,,,
299995,28775,134959,45,,,,,,,,82.0,,,,20.000000,,,,
299995,28775,134959,46,65.000000,,4.0,6.0,15.0,5.0,,73.0,,81.000000,,20.000000,128.000000,36.555578,,


In [3]:
print('Number of samples: ', data.shape[0])
print('Number of ICU stays: ', len(data.groupby('icustay_id')))

non_nans_per_predictor = np.array(list(data.groupby('icustay_id').apply(lambda x: np.count_nonzero(~np.isnan(x.values[:, 4:]), 
                                                                                           axis = 0))))
total_missing_predictors = 0
for subject in non_nans_per_predictor:
    for predictor in subject:
        if predictor == 0:
            total_missing_predictors += 1
print('Total number of predictors that are missing at every hour (per subject): ', total_missing_predictors)

Number of samples:  843312
Number of ICU stays:  17569
Total number of predictors that are missing at every hour (per subject):  28794


# Mean and joint imputation

In [14]:
# Initialize matrix to store the results of different imputation methods:
original_data = pd.DataFrame(index = data.index, columns = data.columns)
results_mean_imp = pd.DataFrame(index = data.index, columns = data.columns)
results_joint_imp = pd.DataFrame(index = data.index, columns = data.columns)

In [31]:
from sklearn.impute import KNNImputer

counter = 0
n_folds = 100
unique_icustays = np.unique(data.index.get_level_values('icustay_id'))
kf = KFold(n_splits=n_folds, random_state=0, shuffle=True)
for train_index, test_index in kf.split(unique_icustays):
    
    print('Imputing fold ', counter + 1, '/', n_folds, end = '\r')
    counter += 1
  
    train_index = unique_icustays[train_index]
    test_index = unique_icustays[test_index]

    data_train = data.iloc[data.index.get_level_values('icustay_id').isin(train_index)]
    data_test = data.iloc[data.index.get_level_values('icustay_id').isin(test_index)]

    for i in range(len(data.columns)): # round categorical to closest category
        if data.columns[i] in categorical_var:
            data_test.loc[:, data.columns[i]] = np.round(data_test.loc[:, data.columns[i]])
    original_data.loc[data.index.get_level_values('icustay_id').isin(test_index), :] = data_test
    
    # Replace the test data with NaNs so we can learn the values:
    data_test = np.empty(data_test.shape)
    data_test[:] = np.nan
    
    # Mean imputation:
    mean_imp = np.nanmean(data_train, axis = 0)
    for i in range(len(mean_imp)): # round categorical to closest category
        if data.columns[i] in categorical_var:
            mean_imp[i] = np.round(mean_imp[i])  
    results_mean_imp.loc[data.index.get_level_values('icustay_id').isin(test_index), :] = mean_imp
    
    # Joint imputation: first extract the mu and covariance, then sample from the Gaussian distribution:    
    mu = np.nanmean(data_train.values, axis = 0) #computes the mean along the specified axis ignoring all NaNs.
    data_train_no_nans = data_train.values
    for i in range(data_train.values.shape[1]):
        data_train_no_nans[:, i] = np.nan_to_num(data_train.values[:, i], nan = mu[i])
    data_train_no_nans = data_train_no_nans.T
    sigma = np.cov(data_train_no_nans)
    sample = np.random.multivariate_normal(mu, sigma, size = len(unique_icustays))
    sample = np.mean(sample, axis = 0)
    for i in range(len(sample)): # round categorical to closest category
        if data.columns[i] in categorical_var:
            sample[i] = np.round(sample[i])
    results_joint_imp.loc[data.index.get_level_values('icustay_id').isin(test_index), :] = sample


Imputing fold  100 / 100

# KNN imputation

In [4]:
results_knn_imp = pd.DataFrame(index = data.index, columns = data.columns)


In [33]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(data)
scaled_data= scaler.transform(data)
scaled_data=np.array_split(scaled_data,8)

In [28]:

# KNN imputation
knn_imputed_results=[]
counter=0
for i in range(len(scaled_data)):
    print('Imputing split ', counter + 1, '...')
    counter += 1
    knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
    knn_imp = knn_imputer.fit_transform(scaled_data[i])
    knn_imputed_results.append(knn_imp)


Imputing split  1 ...
Imputing split  2 ...
Imputing split  3 ...
Imputing split  4 ...
Imputing split  5 ...
Imputing split  6 ...
Imputing split  7 ...
Imputing split  8 ...


In [48]:
knn_results=np.concatenate(knn_imputed_results)
knn_results=scaler.inverse_transform(knn_results)
knn_imputed_data = pd.DataFrame(index = data.index, columns = data.columns)
knn_imputed_data.loc[:,:]=knn_results

# Iterative Imputer

In [7]:
tsm_df=pd.read_csv(data_path+'/time_since_measured_df.csv')

# Filter out subjects with length of stay < 48 hours for the in-hospital mortality task:
y = pd.read_hdf(data_path+'/vitals_hourly_data_preprocessed.h5', 'Y')['los']
indices_to_remove = []
for i, row in y.iteritems():
    if row < 48:
        indices_to_remove.append(i)
        
tsm_df = tsm_df.reset_index().set_index('icustay_id').drop(indices_to_remove, axis = 0)
y = y.drop(indices_to_remove, axis = 0)

# Extract only the first 48 hours:
tsm_df = tsm_df[tsm_df['hours_in'] < 48].reset_index().set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in']).drop('index', axis = 1)


In [20]:
tsm = tsm_df.iloc[:,:32]

In [15]:
tsm_df.iloc[:,12:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure.1,Fraction inspired oxygen.1,Glucose.1,Heart Rate.1,Height.1,Mean blood pressure.1,Oxygen saturation.1,Respiratory rate.1,Systolic blood pressure.1,Temperature.1,...,"('Glascow coma scale total', 'mean')_11.0","('Glascow coma scale total', 'mean')_12.0","('Glascow coma scale total', 'mean')_13.0","('Glascow coma scale total', 'mean')_14.0","('Glascow coma scale total', 'mean')_15.0","('Glascow coma scale verbal response', 'mean')_1.0","('Glascow coma scale verbal response', 'mean')_2.0","('Glascow coma scale verbal response', 'mean')_3.0","('Glascow coma scale verbal response', 'mean')_4.0","('Glascow coma scale verbal response', 'mean')_5.0"
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200003,27513,163557,0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,1
200003,27513,163557,1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,2,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,3,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,4,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,44,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,45,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,46,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0,0,0,0,1,0,0,0,0,1


In [17]:
tsm=tsm_df.rename(columns={'Diastolic blood pressure.1':'Diastolic blood pressure(count)', 'Fraction inspired oxygen.1':'Fraction inspired oxygen(count)',
       'Glucose.1':'Glucose(count)', 'Heart Rate.1':'Heart Rate(count)', 'Height.1':'Height(count)', 'Mean blood pressure.1':'Mean blood pressure(count)',
       'Oxygen saturation.1':'Oxygen saturation(count)', 'Respiratory rate.1':'Respiratory rate(count)',
       'Systolic blood pressure.1':'Systolic blood pressure(count)', 'Temperature.1':'Temperature(count)', 'Weight.1':'Weight(count)', 'pH.1':'pH(count)',
       'Diastolic blood pressure.2':'Diastolic blood pressure(t_s_m)', 'Fraction inspired oxygen.2':'Fraction inspired oxygen(t_s_m)', 'Glucose.2':'Glucose(t_s_m)',
       'Heart Rate.2':'Heart Rate(t_s_m)', 'Height.2':'Height(t_s_m)', 'Mean blood pressure.2':'Mean blood pressure(t_s_m)',
       'Oxygen saturation.2':'Oxygen saturation(t_s_m)', 'Respiratory rate.2':'Respiratory rate(t_s_m)',
       'Systolic blood pressure.2':'Systolic blood pressure(t_s_m)', 'Temperature.2':'Temperature(t_s_m)', 'Weight.2':'Weight(t_s_m)', 'pH.2':'pH(t_s_m)',
       "('Glascow coma scale eye opening', 'mean')_1.0":'Glascow coma scale eye opening 1',
       "('Glascow coma scale eye opening', 'mean')_2.0":'Glascow coma scale eye opening 2',
       "('Glascow coma scale eye opening', 'mean')_3.0":'Glascow coma scale eye opening 3',
       "('Glascow coma scale eye opening', 'mean')_4.0":'Glascow coma scale eye opening 4',
       "('Glascow coma scale motor response', 'mean')_1.0":'Glascow coma scale motor response 1',
       "('Glascow coma scale motor response', 'mean')_2.0":'Glascow coma scale motor response 2',
       "('Glascow coma scale motor response', 'mean')_3.0":'Glascow coma scale motor response 3',
       "('Glascow coma scale motor response', 'mean')_4.0":'Glascow coma scale motor response 4',
       "('Glascow coma scale motor response', 'mean')_5.0":'Glascow coma scale motor response 5',
       "('Glascow coma scale motor response', 'mean')_6.0":'Glascow coma scale motor response 6',
       "('Glascow coma scale total', 'mean')_3.0":'Glascow coma scale total 3',
       "('Glascow coma scale total', 'mean')_4.0":'Glascow coma scale total 4',
       "('Glascow coma scale total', 'mean')_5.0":'Glascow coma scale total 5',
       "('Glascow coma scale total', 'mean')_6.0":'Glascow coma scale total 6',
       "('Glascow coma scale total', 'mean')_7.0":'Glascow coma scale total 7',
       "('Glascow coma scale total', 'mean')_8.0":'Glascow coma scale total 8',
       "('Glascow coma scale total', 'mean')_9.0":'Glascow coma scale total 9',
       "('Glascow coma scale total', 'mean')_10.0":'Glascow coma scale total 10',
       "('Glascow coma scale total', 'mean')_11.0":'Glascow coma scale total 11',
       "('Glascow coma scale total', 'mean')_12.0":'Glascow coma scale total 12',
       "('Glascow coma scale total', 'mean')_13.0":'Glascow coma scale total 13',
       "('Glascow coma scale total', 'mean')_14.0":'Glascow coma scale total 14',
       "('Glascow coma scale total', 'mean')_15.0":'Glascow coma scale total 15',
       "('Glascow coma scale verbal response', 'mean')_1.0":'Glascow coma scale verbal response 1',
       "('Glascow coma scale verbal response', 'mean')_2.0":'Glascow coma scale verbal response 2',
       "('Glascow coma scale verbal response', 'mean')_3.0":'Glascow coma scale verbal response 3',
       "('Glascow coma scale verbal response', 'mean')_4.0":'Glascow coma scale verbal response 4',
       "('Glascow coma scale verbal response', 'mean')_5.0":'Glascow coma scale verbal response 5'})

In [13]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

scaler.fit(tsm_df)
scaled_data= scaler.transform(tsm_df)

itr_imputer = IterativeImputer(random_state=0,verbose=True)

itr_imp = itr_imputer.fit_transform(scaled_data)

[IterativeImputer] Completing matrix with shape (843312, 64)
[IterativeImputer] Change: 42.618441914612625, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 129.07224634828677, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 227.32945929363666, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 96.28879045546097, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 15.47650473257004, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 14.584389105478332, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 14.80936337784411, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 14.949591698306985, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 14.967719615646843, scaled tolerance: 0.05763356915289005 
[IterativeImputer] Change: 14.87355492915208, scaled tolerance: 0.05763356915289005 


In [16]:
itr_results=scaler.inverse_transform(itr_imp)
itr_imp_data = pd.DataFrame(data=itr_results, index= tsm_df.index, columns = tsm_df.columns)

In [17]:
itr_imp_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,"('Glascow coma scale total', 'mean')_11.0","('Glascow coma scale total', 'mean')_12.0","('Glascow coma scale total', 'mean')_13.0","('Glascow coma scale total', 'mean')_14.0","('Glascow coma scale total', 'mean')_15.0","('Glascow coma scale verbal response', 'mean')_1.0","('Glascow coma scale verbal response', 'mean')_2.0","('Glascow coma scale verbal response', 'mean')_3.0","('Glascow coma scale verbal response', 'mean')_4.0","('Glascow coma scale verbal response', 'mean')_5.0"
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200003,27513,163557,0,49.000000,0.412694,110.000000,119.000000,162.721725,58.000000,97.000000,35.000000,91.000000,33.496370,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
200003,27513,163557,1,52.000000,0.476682,88.666190,118.000000,171.761069,59.000000,96.000000,32.000000,88.333333,38.999999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,27513,163557,2,52.333333,0.542221,103.896852,116.000000,171.004038,59.666667,95.000000,30.333333,85.333333,38.277790,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,27513,163557,3,60.500000,0.593867,114.005613,112.000000,170.328094,65.500000,93.500000,32.500000,86.500000,37.777790,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,27513,163557,4,61.000000,0.692990,141.323924,108.000000,168.004980,67.000000,91.500000,37.000000,89.500000,36.833318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,0.578007,130.282197,74.000000,171.336958,93.000000,91.565013,19.000000,145.000000,36.720916,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299995,28775,134959,44,61.758583,0.624399,135.841049,72.000000,170.457587,77.962389,89.611095,20.000000,117.730907,36.587608,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299995,28775,134959,45,59.786059,0.644466,136.800846,82.000000,170.319004,68.480130,87.918774,20.000000,123.200560,36.651635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299995,28775,134959,46,65.000000,0.504469,127.863860,73.000000,174.085858,81.000000,87.639200,20.000000,128.000000,36.555578,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
itr_imp_data.to_csv(data_path+'/imputed_data/itr_imputed(tsm).csv')

# MICE

In [18]:
import miceforest as mf

mf.ampute_data(data,perc=0.25)

# Create kernels. 
kernel = mf.ImputationKernel(
  data=tsm,
  datasets=4,
  save_all_iterations=True,
  random_state=1991
)

# Run the MICE algorithm for 3 iterations on each of the datasets
kernel.mice(3,verbose=True)

Dataset 0
1  | Heart Rate | Respiratory rate | Systolic blood pressure | Diastolic blood pressure | Mean blood pressure | Oxygen saturation | Weight | Temperature | Glucose | pH | Fraction inspired oxygen | Height
2  | Heart Rate | Respiratory rate | Systolic blood pressure | Diastolic blood pressure | Mean blood pressure | Oxygen saturation | Weight | Temperature | Glucose | pH | Fraction inspired oxygen | Height
3  | Heart Rate | Respiratory rate | Systolic blood pressure | Diastolic blood pressure | Mean blood pressure | Oxygen saturation | Weight | Temperature | Glucose | pH | Fraction inspired oxygen | Height
Dataset 1
1  | Heart Rate | Respiratory rate | Systolic blood pressure | Diastolic blood pressure | Mean blood pressure | Oxygen saturation | Weight | Temperature | Glucose | pH | Fraction inspired oxygen | Height
2  | Heart Rate | Respiratory rate | Systolic blood pressure | Diastolic blood pressure | Mean blood pressure | Oxygen saturation | Weight | Temperature | Glucose |

In [19]:
mice_imputed_data = kernel.complete_data(2)

In [20]:
mice_imputed_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,Glascow coma scale total 11,Glascow coma scale total 12,Glascow coma scale total 13,Glascow coma scale total 14,Glascow coma scale total 15,Glascow coma scale verbal response 1,Glascow coma scale verbal response 2,Glascow coma scale verbal response 3,Glascow coma scale verbal response 4,Glascow coma scale verbal response 5
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200003,27513,163557,0,49.000000,1.0,110.0,119.0,68.79,58.000000,97.0,35.000000,91.000000,36.000001,...,0,0,0,0,1,0,0,0,0,1
200003,27513,163557,1,52.000000,1.0,179.0,118.0,152.20,59.000000,96.0,32.000000,88.333333,38.999999,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,2,52.333333,0.5,84.0,116.0,154.97,59.666667,95.0,30.333333,85.333333,38.277790,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,3,60.500000,1.0,96.0,112.0,154.97,65.500000,93.5,32.500000,86.500000,37.777790,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,4,61.000000,1.0,137.0,108.0,162.78,67.000000,91.5,37.000000,89.500000,36.833318,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,0.4,114.0,74.0,193.02,93.000000,96.5,19.000000,145.000000,37.166683,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,44,49.000000,0.5,87.0,72.0,170.09,68.000000,98.0,20.000000,131.000000,36.166682,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,45,52.000000,0.5,169.0,82.0,152.20,69.000000,100.0,20.000000,103.000000,36.166682,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,46,65.000000,0.5,114.0,73.0,180.17,81.000000,100.0,20.000000,128.000000,36.555578,...,0,0,0,0,1,0,0,0,0,1


### Take mean of dataset and round catagorial variables

In [21]:
completed_dataset1 = kernel.complete_data(dataset=0, inplace=False)
completed_dataset2 = kernel.complete_data(dataset=1, inplace=False)
completed_dataset3 = kernel.complete_data(dataset=2, inplace=False)
completed_dataset4 = kernel.complete_data(dataset=3, inplace=False)

mice_imputed_results= (completed_dataset1 + completed_dataset2 + completed_dataset3+ completed_dataset4)/4
# for i in range(len(mice_imputed_results.columns)): # round categorical to closest category
#     if mice_imputed_results.columns[i] in categorical_var:
#         mice_imputed_results[mice_imputed_results.columns[i]] = np.round(mice_imputed_results[mice_imputed_results.columns[i]])  

In [22]:
mice_imputed_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,Glascow coma scale total 11,Glascow coma scale total 12,Glascow coma scale total 13,Glascow coma scale total 14,Glascow coma scale total 15,Glascow coma scale verbal response 1,Glascow coma scale verbal response 2,Glascow coma scale verbal response 3,Glascow coma scale verbal response 4,Glascow coma scale verbal response 5
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200003,27513,163557,0,49.000000,0.8750,110.00,119.00,53.2950,58.000000,97.000,35.000000,91.000000,37.291671,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
200003,27513,163557,1,52.000000,0.7250,136.50,118.00,149.8050,59.000000,96.000,32.000000,88.333333,38.999999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,27513,163557,2,52.333333,0.6000,131.00,116.00,158.0575,59.666667,95.000,30.333333,85.333333,38.277790,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,27513,163557,3,60.500000,0.7250,207.75,112.00,158.0575,65.500000,93.500,32.500000,86.500000,37.777790,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,27513,163557,4,61.000000,0.7750,151.75,108.00,169.6475,67.000000,91.500,37.000000,89.500000,36.833318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,0.4250,170.50,74.00,179.0350,93.000000,98.375,19.000000,145.000000,36.722226,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299995,28775,134959,44,54.500000,0.4500,150.90,72.00,165.1750,78.250000,98.000,20.000000,136.500000,36.627782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299995,28775,134959,45,60.250000,0.6250,110.75,82.00,165.6175,77.750000,96.250,20.000000,120.750000,36.972236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299995,28775,134959,46,65.000000,0.7375,138.50,73.00,170.7825,81.000000,97.625,20.000000,128.000000,36.555578,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [23]:
mice_imputed_results.to_csv(data_path+'/imputed_data/mice_imputed_v4.csv')

In [31]:
mice_imputed_results[tsm_df.iloc[:,32:].columns]= tsm_df.iloc[:,32:]

In [32]:
mice_imputed_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,"('Glascow coma scale total', 'mean')_11.0","('Glascow coma scale total', 'mean')_12.0","('Glascow coma scale total', 'mean')_13.0","('Glascow coma scale total', 'mean')_14.0","('Glascow coma scale total', 'mean')_15.0","('Glascow coma scale verbal response', 'mean')_1.0","('Glascow coma scale verbal response', 'mean')_2.0","('Glascow coma scale verbal response', 'mean')_3.0","('Glascow coma scale verbal response', 'mean')_4.0","('Glascow coma scale verbal response', 'mean')_5.0"
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200003,27513,163557,0,49.000000,0.68750,110.000000,119.0,155.66250,58.000000,97.000,35.000000,91.000000,37.275000,...,0,0,0,0,1,0,0,0,0,1
200003,27513,163557,1,52.000000,0.70000,128.000000,118.0,164.07375,59.000000,96.000,32.000000,88.333333,38.999999,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,2,52.333333,0.60000,142.125000,116.0,165.74250,59.666667,95.000,30.333333,85.333333,38.277790,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,3,60.500000,0.62500,134.333333,112.0,163.79000,65.500000,93.500,32.500000,86.500000,37.777790,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,4,61.000000,0.65000,168.333333,108.0,159.31750,67.000000,91.500,37.000000,89.500000,36.833318,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,0.65000,133.708333,74.0,175.82250,93.000000,96.500,19.000000,145.000000,36.308336,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,44,61.062500,0.53750,157.500000,72.0,171.60000,77.750000,89.375,20.000000,126.250000,36.749993,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,45,54.375000,0.70000,146.000000,82.0,172.73500,69.500000,94.750,20.000000,112.250000,37.444448,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,46,65.000000,0.61875,134.083333,73.0,175.94750,81.000000,97.625,20.000000,128.000000,36.555578,...,0,0,0,0,1,0,0,0,0,1


In [54]:
mean_imputed_data = original_data.fillna(results_mean_imp)
joint_imputed_data = original_data.fillna(results_joint_imp)

mean_imputed_data.to_csv(data_path+'/imputed_data/mean_imputed.csv')
joint_imputed_data.to_csv(data_path+'/imputed_data/joint_imputed.csv')
mice_imputed_data.to_csv(data_path+'/imputed_data/mice_imputed.csv')
knn_imputed_data.to_csv(data_path+'/imputed_data/knn_imputed.csv')
itr_imp_data.to_csv(data_path+'/imputed_data/itr_imputed.csv')

# Multiple Imputation using Denoising Autoencoders (MIDAs)

In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import tensorflow as tf
import MIDASpy as md

In [34]:
tsm.columns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,...,Glascow coma scale total 11,Glascow coma scale total 12,Glascow coma scale total 13,Glascow coma scale total 14,Glascow coma scale total 15,Glascow coma scale verbal response 1,Glascow coma scale verbal response 2,Glascow coma scale verbal response 3,Glascow coma scale verbal response 4,Glascow coma scale verbal response 5
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200003,27513,163557,0,49.000000,,110.0,119.0,,58.000000,97.0,35.000000,91.000000,,...,0,0,0,0,1,0,0,0,0,1
200003,27513,163557,1,52.000000,,,118.0,,59.000000,96.0,32.000000,88.333333,38.999999,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,2,52.333333,,,116.0,,59.666667,95.0,30.333333,85.333333,38.277790,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,3,60.500000,,,112.0,,65.500000,93.5,32.500000,86.500000,37.777790,...,0,0,0,0,0,0,0,0,0,0
200003,27513,163557,4,61.000000,,,108.0,,67.000000,91.5,37.000000,89.500000,36.833318,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,,,74.0,,93.000000,,19.000000,145.000000,,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,44,,,,72.0,,,,20.000000,,,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,45,,,,82.0,,,,20.000000,,,...,0,0,0,0,0,0,0,0,0,0
299995,28775,134959,46,65.000000,,,73.0,,81.000000,,20.000000,128.000000,36.555578,...,0,0,0,0,1,0,0,0,0,1


In [24]:
data_0=tsm.reset_index()
categorical  = data_0.iloc[:,40:].columns
data_cat, cat_cols_list = md.cat_conv(data_0[categorical])

data_0.drop(categorical, axis = 1, inplace = True)
constructor_list = [data_0]
constructor_list.append(data_cat)
data_in = pd.concat(constructor_list, axis=1)

na_loc = data_in.isnull()
data_in[na_loc] = np.nan



In [25]:
data_in 

Unnamed: 0,icustay_id,subject_id,hadm_id,hours_in,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,...,Glascow coma scale verbal response 1_0,Glascow coma scale verbal response 1_1,Glascow coma scale verbal response 2_0,Glascow coma scale verbal response 2_1,Glascow coma scale verbal response 3_0,Glascow coma scale verbal response 3_1,Glascow coma scale verbal response 4_0,Glascow coma scale verbal response 4_1,Glascow coma scale verbal response 5_0,Glascow coma scale verbal response 5_1
0,200003,27513,163557,0,49.000000,,110.0,119.0,,58.000000,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,200003,27513,163557,1,52.000000,,,118.0,,59.000000,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,200003,27513,163557,2,52.333333,,,116.0,,59.666667,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,200003,27513,163557,3,60.500000,,,112.0,,65.500000,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,200003,27513,163557,4,61.000000,,,108.0,,67.000000,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843307,299995,28775,134959,43,82.000000,,,74.0,,93.000000,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
843308,299995,28775,134959,44,,,,72.0,,,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
843309,299995,28775,134959,45,,,,82.0,,,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
843310,299995,28775,134959,46,65.000000,,,73.0,,81.000000,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [30]:
imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 2000, input_drop = 0.85)
imputer.build_model(tsm)
imputer.train_model(training_epochs = 10)

Size index: [64]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 428.55945339503603
Epoch: 1 , loss: 363.57590302826276
Epoch: 2 , loss: 352.79708886159665
Epoch: 3 , loss: 347.455071416367
Epoch: 4 , loss: 343.36183631212737
Epoch: 5 , loss: 340.36795660613814
Epoch: 6 , loss: 338.5040181715256
Epoch: 7 , loss: 337.5078083219052
Epoch: 8 , loss: 336.4161469683554
Epoch: 9 , loss: 335.56696404476577
Training complete. Saving file...
Model saved in file: tmp/MIDAS


<MIDASpy.midas_base.Midas at 0x19d8b2b78c8>

In [33]:
imputations = imputer.generate_samples(m=10).output_list 
imputations=[imputations]

results=imputations

INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


ValueError: cannot join with no overlapping index names

In [None]:
imputations=imputations[0]

In [None]:
flat_cats = [cat for variable in cat_cols_list for cat in variable]

for i in range(len(imputations)):
    tmp_cat = [imputations[i][x].idxmax(axis=1) for x in cat_cols_list]
    cat_df = pd.DataFrame({categorical[i]:tmp_cat[i] for i in range(len(categorical))})
    imputations[i] = pd.concat([imputations[i], cat_df], axis = 1).drop(flat_cats, axis = 1)

imputations=imputations[0]

In [32]:
imputations.iloc[:,40:].columns

NameError: name 'imputations' is not defined

In [16]:
imputations.iloc[:,40:]

Unnamed: 0,"('Glascow coma scale eye opening', 'mean')_1.0","('Glascow coma scale eye opening', 'mean')_2.0","('Glascow coma scale eye opening', 'mean')_3.0","('Glascow coma scale eye opening', 'mean')_4.0","('Glascow coma scale motor response', 'mean')_1.0","('Glascow coma scale motor response', 'mean')_2.0","('Glascow coma scale motor response', 'mean')_3.0","('Glascow coma scale motor response', 'mean')_4.0","('Glascow coma scale motor response', 'mean')_5.0","('Glascow coma scale motor response', 'mean')_6.0",...,"('Glascow coma scale total', 'mean')_11.0","('Glascow coma scale total', 'mean')_12.0","('Glascow coma scale total', 'mean')_13.0","('Glascow coma scale total', 'mean')_14.0","('Glascow coma scale total', 'mean')_15.0","('Glascow coma scale verbal response', 'mean')_1.0","('Glascow coma scale verbal response', 'mean')_2.0","('Glascow coma scale verbal response', 'mean')_3.0","('Glascow coma scale verbal response', 'mean')_4.0","('Glascow coma scale verbal response', 'mean')_5.0"
0,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_1","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_1","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
1,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
2,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
3,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
4,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843307,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
843308,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
843309,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_0","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_0","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."
843310,"('Glascow coma scale eye opening', 'mean')_1.0_0","('Glascow coma scale eye opening', 'mean')_2.0_0","('Glascow coma scale eye opening', 'mean')_3.0_0","('Glascow coma scale eye opening', 'mean')_4.0_1","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...","('Glascow coma scale motor response', 'mean')_...",...,"('Glascow coma scale total', 'mean')_11.0_0","('Glascow coma scale total', 'mean')_12.0_0","('Glascow coma scale total', 'mean')_13.0_0","('Glascow coma scale total', 'mean')_14.0_0","('Glascow coma scale total', 'mean')_15.0_1","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')...","('Glascow coma scale verbal response', 'mean')..."


In [21]:
def removeText(value):
    for i in imputations.iloc[:,40:].columns:
        value=str(value)
        if i in value:
            value=value[0]


    # if "Glascow coma scale eye opening_" in value:
    #     value=value.replace("Glascow coma scale eye opening_", "")
    #     value=float(value)
    # elif "Glascow coma scale motor response_" in value:
    #     value=value.replace("Glascow coma scale motor response_", "")
    #     value=float(value)
    # elif "Glascow coma scale total_" in value:
    #     value=value.replace("Glascow coma scale total_", "")
    #     value=float(value)
    # elif "Glascow coma scale verbal response_" in value:
    #     value=value.replace("Glascow coma scale verbal response_", "")
    #     value=float(value)
    
    return value

In [22]:
imputations[imputations.iloc[:,40:].columns] = imputations[imputations.iloc[:,40:].columns].apply(removeText)

In [26]:
imputations.iloc[:,48:58]

Unnamed: 0,"('Glascow coma scale motor response', 'mean')_5.0","('Glascow coma scale motor response', 'mean')_6.0","('Glascow coma scale total', 'mean')_3.0","('Glascow coma scale total', 'mean')_4.0","('Glascow coma scale total', 'mean')_5.0","('Glascow coma scale total', 'mean')_6.0","('Glascow coma scale total', 'mean')_7.0","('Glascow coma scale total', 'mean')_8.0","('Glascow coma scale total', 'mean')_9.0","('Glascow coma scale total', 'mean')_10.0"
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
843307,0,0,0,0,0,0,0,0,0,0
843308,0,0,0,0,0,0,0,0,0,0
843309,0,0,0,0,0,0,0,0,0,0
843310,0,0,0,0,0,0,0,0,0,0


In [31]:
imputations['Glascow coma scale eye opening']=imputations['Glascow coma scale eye opening'].apply(removeText)
imputations['Glascow coma scale motor response']=imputations['Glascow coma scale motor response'].apply(removeText)
imputations['Glascow coma scale total']=imputations['Glascow coma scale total'].apply(removeText)
imputations['Glascow coma scale verbal response']=imputations['Glascow coma scale verbal response'].apply(removeText)

KeyError: 'Glascow coma scale eye opening'

In [26]:
imputations=imputations[['icustay_id', 'subject_id', 'hadm_id', 'hours_in',
       'Diastolic blood pressure', 'Fraction inspired oxygen',
       'Glascow coma scale eye opening', 'Glascow coma scale motor response',
       'Glascow coma scale total', 'Glascow coma scale verbal response',
       'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure',
       'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure',
       'Temperature', 'Weight', 'pH']]

In [35]:
midas_imputed = imputations.reset_index().set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in']).drop('index', axis = 1)
midas_imputed.to_csv(data_path+'/imputed_data/midas_imputedv3.csv')

In [36]:
midas_imputed

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
200003,27513,163557,0,49.000000,-16.224348,110.000000,119.000000,-410.519104,58.000000,97.000000,35.000000,91.000000,14.118531,77.500000,-25.230787,4.0,6.0,15.0,5.0
200003,27513,163557,1,52.000000,-22.304611,109.638176,118.000000,-334.816956,59.000000,96.000000,32.000000,88.333333,38.999999,77.500000,-89.606079,4.0,6.0,14.0,4.0
200003,27513,163557,2,52.333333,1.839921,62.048153,116.000000,368.759583,59.666667,95.000000,30.333333,85.333333,38.277790,77.500000,-40.151772,4.0,6.0,14.0,4.0
200003,27513,163557,3,60.500000,-34.386002,130.683426,112.000000,330.477997,65.500000,93.500000,32.500000,86.500000,37.777790,77.500000,1.754184,4.0,1.0,13.5,5.0
200003,27513,163557,4,61.000000,17.688700,123.097832,108.000000,79.589912,67.000000,91.500000,37.000000,89.500000,36.833318,77.500000,3.599597,4.0,6.0,15.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,-14.951100,110.660110,74.000000,457.235596,93.000000,91.105774,19.000000,145.000000,40.342030,62.782387,-25.925615,4.0,6.0,14.0,5.0
299995,28775,134959,44,46.086784,-31.086552,191.171448,72.000000,-187.846924,68.887207,89.020836,20.000000,118.126953,13.254373,92.296196,-36.942841,4.0,3.5,15.0,1.0
299995,28775,134959,45,33.928425,89.005859,190.334549,82.000000,-112.851791,56.774078,105.407890,20.000000,120.194992,24.558828,98.813690,-40.680080,4.0,1.0,15.0,4.0
299995,28775,134959,46,65.000000,-80.024994,105.623512,73.000000,753.685791,81.000000,97.529556,20.000000,128.000000,36.555578,40.550140,-17.299129,4.0,6.0,15.0,5.0
