# Development of machine learning models to process Electronic Health Records – Explainable Models

### Leave One Out Cross validation Experiment Notebook
Lok Hang Toby Lee (2431180L)

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# SET YOUR PATH FOR RESOURCES FILE HERE
resources_path = "C:/Users/USER/OneDrive/GU/Year 4/FYP/MIMIC-III-ML/data/resources"
data_path = "C:/Users/USER/OneDrive/GU/Year 4/FYP/MIMIC-III-ML/data/raw"

In [3]:
# Read in the data and only select the continuous variables:
config = json.load(open(resources_path + '/discretizer_config.json', 'r'))
is_categorical = config['is_categorical_channel']
categorical_var = [key for key, value in config['is_categorical_channel'].items() if value][1:]
patient_identifiers = ['icustay_id', 'subject_id', 'hadm_id', 'hours_in']
data = pd.read_csv(data_path+'/mimic_timeseries_data_not_imputed.csv')

# Filter out subjects with length of stay < 48 hours for the in-hospital mortality task:
y = pd.read_hdf(data_path+'/vitals_hourly_data_preprocessed.h5', 'Y')['los']
indices_to_remove = []
for i, row in y.iteritems():
    if row < 48:
        indices_to_remove.append(i)
        
data = data.reset_index().set_index('icustay_id').drop(indices_to_remove, axis = 0)
y = y.drop(indices_to_remove, axis = 0)

# Extract only the first 48 hours:
data = data[data['hours_in'] < 48].reset_index().set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in']).drop('index', axis = 1)
data = data.drop('Unnamed: 0', axis = 1)


In [5]:
icu_stay_to_remove=[]
for i in range(0,843312,48):
    if(any(~np.isnan(data.iloc[i:i+48,:].values).reshape(48*16,))):
        continue
    else:
        icu_stay_to_remove.append(data.reset_index().iloc[i]['icustay_id'])
loocv_data=data.drop(icu_stay_to_remove)

In [4]:
import random

loocv_coord=[]
loocv_true=[]
for i in range(0,len(loocv_data),48):
    if(any(~np.isnan(loocv_data.iloc[i:i+48,:].values).reshape(48*16,))):
        while(True):
            x=random.randint(0,47)+i
            # Select random value from random row for each patient that is not null
            notempty = ~np.isnan(loocv_data.iloc[x,:].values)
            if(any(notempty)):
                break
        
        y=random.choice(np.squeeze(np.argwhere(notempty),axis=1))
        value = loocv_data.iloc[x,y]
        coord=[x,y]
        loocv_true.append(value)
        loocv_coord.append(coord)



In [2]:
from numpy import loadtxt
loocv_coord = loadtxt(data_path+'/imputed_data/loocv1_cord.csv', delimiter=',').astype(int)
loocv_true=loadtxt(data_path+'/imputed_data/loocv1_true.csv', delimiter=',').astype(int)

In [7]:
# Holdout values and replace with np.nan
loocv_data=loocv_data.reset_index()
for i in loocv_coord:
    y=loocv_data.columns[i[1]+4]
    loocv_data.loc[i[0],y]=np.nan
loocv_data=loocv_data.set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'])

In [None]:
from numpy import savetxt
savetxt(data_path+'/imputed_data/loocv1_cord.csv', loocv_coord, delimiter=',')
savetxt(data_path+'/imputed_data/loocv1_true.csv', loocv_true, delimiter=',')


# Mean/ Most frequent imputation

In [21]:
from sklearn.impute import SimpleImputer
# Define the SimpleImputer instances
mean_imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')
most_frequent_imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
results_mean_imp = pd.DataFrame(index = loocv_data.index, columns = loocv_data.columns)
results_most_frequent_imp = pd.DataFrame(index = loocv_data.index, columns = loocv_data.columns)


mean_imp = mean_imputer.fit_transform(loocv_data)
results_mean_imp.loc[:,:] = mean_imp

most_frequent_imp= most_frequent_imputer.fit_transform(loocv_data)
results_most_frequent_imp.loc[:,:] = most_frequent_imp


In [22]:
results_mean_imp.to_csv(data_path+'/imputed_data/mean_imputed_loocv1.csv')
results_most_frequent_imp.to_csv(data_path+'/imputed_data/most_frequent_imputed_loocv1.csv')

# KNN imputation

In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
#Caution! the knn imputer takes a very long time to run
results_knn_imp = pd.DataFrame(index = loocv_data.index, columns = loocv_data.columns)
scaler=StandardScaler()
scaled_data= scaler.fit_transform(loocv_data)

knn_imputer = KNNImputer(n_neighbors=5)
knn_imp = knn_imputer.fit_transform(scaled_data)

In [68]:
knn_imp_results=scaler.inverse_transform(knn_imp)
knn_imp_data = pd.DataFrame(data=knn_imp_results, index= loocv_data.index, columns = loocv_data.columns)

knn_imp_data.to_csv(data_path+'/imputed_data/knn_imputed_loocv1.csv')

# Mice imputation

In [54]:
import miceforest as mf
from sklearn.preprocessing import StandardScaler

scaler=MinMaxScaler()
scaled_data= scaler.fit_transform(loocv_data)
scaled_data = pd.DataFrame(data=scaled_data, index= loocv_data.index, columns = loocv_data.columns)


mf.ampute_data(loocv_data,perc=0.25)

# Create kernels. 
kernel = mf.ImputationKernel(
  data=loocv_data,
  datasets=4,
  save_all_iterations=True,
  random_state=1991
)

In [52]:
optimal_parameters={7: {'boosting': 'gbdt', 'num_iterations': 5000, 'max_depth': 8, 'num_leaves': 14, 'min_data_in_leaf': 29025, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.6116460024199847, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.4853652724039139, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 635355, 'learning_rate': 0.05, 'cat_smooth': 23.629257396732374}, 11: {'boosting': 'gbdt', 'num_iterations': 5000, 'max_depth': 8, 'num_leaves': 18, 'min_data_in_leaf': 46539, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.7406864914680863, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.6938637860511762, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 963763, 'learning_rate': 0.05, 'cat_smooth': 20.277040417276172}, 12: {'boosting': 'gbdt', 'num_iterations': 5000, 'max_depth': 8, 'num_leaves': 13, 'min_data_in_leaf': 28478, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.7224280952458146, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.549284787782835, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 216295, 'learning_rate': 0.05, 'cat_smooth': 20.845191150511962}, 0: {'boosting': 'gbdt', 'num_iterations': 5000, 'max_depth': 8, 'num_leaves': 16, 'min_data_in_leaf': 53130, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.9771906149487628, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.9208567620588631, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 592335, 'learning_rate': 0.05, 'cat_smooth': 2.9558111585524545}, 9: {'boosting': 'gbdt', 'num_iterations': 2407, 'max_depth': 8, 'num_leaves': 12, 'min_data_in_leaf': 4186, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.3185085699966633, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.8664728224215659, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 347490, 'learning_rate': 0.05, 'cat_smooth': 9.160646730591216}, 10: {'boosting': 'gbdt', 'num_iterations': 1771, 'max_depth': 8, 'num_leaves': 23, 'min_data_in_leaf': 3095, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.35404184173949216, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.26378529369316644, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 549843, 'learning_rate': 0.05, 'cat_smooth': 12.109747961091713}, 14: {'boosting': 'gbdt', 'num_iterations': 1366, 'max_depth': 8, 'num_leaves': 17, 'min_data_in_leaf': 648, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.5240446778187317, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.6898016586044188, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 695751, 'learning_rate': 0.05, 'cat_smooth': 9.359330730126967}, 13: {'boosting': 'gbdt', 'num_iterations': 4306, 'max_depth': 8, 'num_leaves': 24, 'min_data_in_leaf': 9906, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.8139546334618256, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.7501835065984781, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 477155, 'learning_rate': 0.05, 'cat_smooth': 9.664000784964383}, 2: {'boosting': 'gbdt', 'num_iterations': 4927, 'max_depth': 8, 'num_leaves': 8, 'min_data_in_leaf': 5976, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.6467662968495774, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.7881371834576874, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 599809, 'learning_rate': 0.05, 'cat_smooth': 14.064176006097743}, 5: {'boosting': 'gbdt', 'num_iterations': 4999, 'max_depth': 8, 'num_leaves': 14, 'min_data_in_leaf': 8341, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.8113716443633006, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.6271055492984837, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 460998, 'learning_rate': 0.05, 'cat_smooth': 14.388532491341138}, 3: {'boosting': 'gbdt', 'num_iterations': 1820, 'max_depth': 8, 'num_leaves': 24, 'min_data_in_leaf': 1226, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.7307860355171651, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.4027123149811854, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 476956, 'learning_rate': 0.05, 'cat_smooth': 1.0531589531064978}, 6: {'boosting': 'gbdt', 'num_iterations': 2378, 'max_depth': 8, 'num_leaves': 9, 'min_data_in_leaf': 7576, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.8781416185329748, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.7074383770513859, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 359880, 'learning_rate': 0.05, 'cat_smooth': 0.009484523593586425}, 4: {'boosting': 'gbdt', 'num_iterations': 5000, 'max_depth': 8, 'num_leaves': 13, 'min_data_in_leaf': 212, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.7782329350790853, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.4326679622458992, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 51950, 'learning_rate': 0.05, 'cat_smooth': 24.501566520213196}, 15: {'boosting': 'gbdt', 'num_iterations': 1756, 'max_depth': 8, 'num_leaves': 22, 'min_data_in_leaf': 2499, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.9940340854543842, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.3717374193526257, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 537943, 'learning_rate': 0.05, 'cat_smooth': 14.890905942285512}, 1: {'boosting': 'gbdt', 'num_iterations': 786, 'max_depth': 8, 'num_leaves': 8, 'min_data_in_leaf': 1328, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.9464946314857885, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.9161614326713303, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 522585, 'learning_rate': 0.05, 'cat_smooth': 4.6286175450106635}, 8: {'boosting': 'gbdt', 'num_iterations': 109, 'max_depth': 8, 'num_leaves': 12, 'min_data_in_leaf': 324, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.5733322046975943, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.36988828476178337, 'bagging_freq': 1, 'verbosity': -1, 'objective': 'regression', 'seed': 107227, 'learning_rate': 0.05, 'cat_smooth': 7.310993877763838}}

In [None]:
kernel.mice(3, verbose=True, variable_parameters=optimal_parameters)

In [None]:
mice_imputed_data = kernel.complete_data(2)

In [None]:
completed_dataset1 = kernel.complete_data(dataset=0, inplace=False)
completed_dataset2 = kernel.complete_data(dataset=1, inplace=False)
completed_dataset3 = kernel.complete_data(dataset=2, inplace=False)
completed_dataset4 = kernel.complete_data(dataset=3, inplace=False)

mice_imputed_results= (completed_dataset1 + completed_dataset2 + completed_dataset3+ completed_dataset4)/4

In [None]:
midas_imputed = pd.DataFrame( data = scaler.inverse_transform(mice_imputed_results), index = loocv_data.index, columns = loocv_data.columns)

In [None]:
mice_imputed_results.to_csv(data_path+'/imputed_data/mice_imputed_loocv1.csv')
mice_imputed_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
icustay_id,subject_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
200003,27513,163557,0,49.000000,0.6625,4.00,6.00,15.00,5.00,110.000000,119.00,168.54625,58.000000,97.000000,35.000000,91.000000,37.847224,77.500000,7.325000
200003,27513,163557,1,52.000000,0.7000,3.75,5.50,13.25,4.00,119.500000,118.00,176.51500,59.000000,96.000000,32.000000,88.333333,38.999999,77.500000,7.300000
200003,27513,163557,2,52.333333,1.0000,3.75,6.00,14.75,5.00,109.464286,116.00,166.56000,59.666667,95.000000,30.333333,85.333333,38.277790,77.500000,7.353750
200003,27513,163557,3,60.500000,0.7125,3.00,5.25,10.25,2.00,140.750000,112.00,167.00250,65.500000,93.500000,32.500000,86.500000,37.777790,77.500000,7.290417
200003,27513,163557,4,61.000000,0.8375,2.75,5.50,11.25,3.00,159.500000,108.00,164.35750,67.000000,91.500000,37.000000,89.500000,36.833318,77.500000,7.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,28775,134959,43,82.000000,0.5000,3.00,5.25,11.25,3.00,143.750000,74.00,168.95500,93.000000,98.500000,19.000000,145.000000,37.135185,80.800001,7.425000
299995,28775,134959,44,56.125000,0.3775,4.00,6.00,14.75,4.75,131.500000,72.00,168.13750,74.333334,97.250000,20.000000,112.125000,37.172211,72.150000,7.373750
299995,28775,134959,45,52.875000,0.5750,4.00,5.75,13.25,3.50,122.750000,82.00,166.02625,73.041662,98.750000,20.000000,113.458333,36.461104,48.624999,7.395000
299995,28775,134959,46,65.000000,0.5125,4.00,6.00,15.00,5.00,148.500000,73.00,175.25500,81.000000,96.666667,20.000000,128.000000,36.555578,68.625000,7.340000


# MIDAS Imputation

In [18]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import tensorflow as tf
import MIDASpy as md

In [11]:
results_frame = pd.DataFrame(index = loocv_data.index, columns = loocv_data.columns)

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()
scaled_data= scaler.fit_transform(loocv_data)
scaled_data= pd.DataFrame(data= scaled_data, index = loocv_data.index, columns = loocv_data.columns)

In [19]:
data_0=scaled_data.reset_index()
categorical  = categorical_var
data_cat, cat_cols_list = md.cat_conv(data_0[categorical])

data_0.drop(categorical, axis = 1, inplace = True)
constructor_list = [data_0]
constructor_list.append(data_cat)
data_in = pd.concat(constructor_list, axis=1)

na_loc = data_in.isnull()
data_in[na_loc] = np.nan


In [23]:
imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 2000, input_drop = 0.85,learn_rate=0.0001)
imputer.build_model(data_in)
imputer.train_model(training_epochs = 25)

Size index: [110]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 126384.64530581654
Epoch: 1 , loss: 114320.34381198621
Epoch: 2 , loss: 111922.07197128375
Epoch: 3 , loss: 108789.30278934953
Epoch: 4 , loss: 107082.16220798602
Epoch: 5 , loss: 106601.39003460188
Epoch: 6 , loss: 106159.72356232278
Epoch: 7 , loss: 105745.42315634667
Epoch: 8 , loss: 105323.82978915439
Epoch: 9 , loss: 104992.04880880559
Epoch: 10 , loss: 104711.60154817828
Epoch: 11 , loss: 104397.103358459
Epoch: 12 , loss: 104230.87743983981
Epoch: 13 , loss: 103965.90031125878
Epoch: 14 , loss: 103727.49574943191
Epoch: 15 , loss: 103318.28299314766
Epoch: 16 , loss: 103096.52360846558
Epoch: 17 , loss: 102971.0177314122
Epoch: 18 , loss: 102793.57156706804
Epoch: 19 , loss: 102675.18491985205
Epoch: 20 , loss: 102557.56511572847
Epoch: 21 , loss: 102364.34518157558
Epoch: 22 , loss: 102488.24283078406
Epoch: 23 , loss: 102419.14928374974
Epoch: 24 , loss: 102165.82188781496
Training complete. 

<MIDASpy.midas_base.Midas at 0x1ae52fa2ac8>

In [24]:
imputations = imputer.generate_samples(m=10).output_list 
imputations=[imputations]

results=imputations

INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


In [25]:
imputations=imputations[0]

flat_cats = [cat for variable in cat_cols_list for cat in variable]

for i in range(len(imputations)):
    tmp_cat = [imputations[i][x].idxmax(axis=1) for x in cat_cols_list]
    cat_df = pd.DataFrame({categorical[i]:tmp_cat[i] for i in range(len(categorical))})
    imputations[i] = pd.concat([imputations[i], cat_df], axis = 1).drop(flat_cats, axis = 1)

imputations=imputations[0]

In [26]:
def removeText(value):
    if "Glascow coma scale eye opening_" in value:
        value=value.replace("Glascow coma scale eye opening_", "")
        value=float(value)
    elif "Glascow coma scale motor response_" in value:
        value=value.replace("Glascow coma scale motor response_", "")
        value=float(value)
    elif "Glascow coma scale total_" in value:
        value=value.replace("Glascow coma scale total_", "")
        value=float(value)
    elif "Glascow coma scale verbal response_" in value:
        value=value.replace("Glascow coma scale verbal response_", "")
        value=float(value)
    
    return value

imputations['Glascow coma scale eye opening']=imputations['Glascow coma scale eye opening'].apply(removeText)
imputations['Glascow coma scale motor response']=imputations['Glascow coma scale motor response'].apply(removeText)
imputations['Glascow coma scale total']=imputations['Glascow coma scale total'].apply(removeText)
imputations['Glascow coma scale verbal response']=imputations['Glascow coma scale verbal response'].apply(removeText)

In [27]:
imputations=imputations[['icustay_id', 'subject_id', 'hadm_id', 'hours_in',
       'Diastolic blood pressure', 'Fraction inspired oxygen',
       'Glascow coma scale eye opening', 'Glascow coma scale motor response',
       'Glascow coma scale total', 'Glascow coma scale verbal response',
       'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure',
       'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure',
       'Temperature', 'Weight', 'pH']]

In [28]:
midas_imputed = imputations.reset_index().set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in']).drop('index', axis = 1)
midas_imputed = pd.DataFrame( data = scaler.inverse_transform(midas_imputed), index = loocv_data.index, columns = loocv_data.columns)
midas_imputed.to_csv(data_path+'/imputed_data/midas_imputed_loocv4.csv')

# LOOCV Prediction

In [3]:
results_most_frequent_imp=pd.read_csv(data_path+'/imputed_data/mean_imputed_loocv1.csv').set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'])
results_mean_imp=pd.read_csv(data_path+'/imputed_data/most_frequent_imputed_loocv1.csv').set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'])
mice_imputed_results=pd.read_csv(data_path+'/imputed_data/mice_imputed_loocv1.csv').set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'])
midas_imputed=pd.read_csv(data_path+'/imputed_data/midas_imputed_loocv1.csv').set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'])
knn_imputed_results=pd.read_csv(data_path+'/imputed_data/knn_imputed_loocv1.csv').set_index(['icustay_id', 'subject_id', 'hadm_id', 'hours_in'])

In [4]:
mf_pred=[]
mean_pred=[]
knn_pred=[]
mice_pred=[]
midas_pred=[]
for x,y in loocv_coord:
    mf_pred.append(results_most_frequent_imp.iloc[x,y])
    mean_pred.append(results_mean_imp.iloc[x,y])
    knn_pred.append(knn_imputed_results.iloc[x,y])
    mice_pred.append(mice_imputed_results.iloc[x,y])
    midas_pred.append(midas_imputed.iloc[x,y])

In [5]:
from sklearn.metrics import mean_squared_error

print("RMSE for mean Imputed: {}".format(np.sqrt(mean_squared_error(loocv_true, mean_pred))))

print("RMSE for most frequent Imputed: {}".format(np.sqrt(mean_squared_error(loocv_true, mf_pred))))

print("RMSE for KNN Imputed: {}".format(np.sqrt(mean_squared_error(loocv_true, knn_pred))))

print("RMSE for MICE Imputed: {}".format(np.sqrt(mean_squared_error(loocv_true, mice_pred))))

print("RMSE for MIDAS Imputed: {}".format(np.sqrt(mean_squared_error(loocv_true, midas_pred))))

RMSE for mean Imputed: 26.541224518153005
RMSE for most frequent Imputed: 18.95382562210686
RMSE for KNN Imputed: 21.10296591283576
RMSE for MICE Imputed: 18.16232819698004
RMSE for MIDAS Imputed: 29.089633723419777
