In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import metrics

In [2]:
# Read the data
df1 = pd.read_excel('1_Clinical data AH_AUD_HC Bernd_bei_updated08012019.xlsx')

# Restrict the sheet to the predictors, the dialysis flag and the two outcome columns
selected_columns = ['age', 'alt', 'ast', 'creatinine_value_mg_dl',
                    'bilirubin_value_mg_dl', 'platelets_value_10to9_l', 'wbc_10to9_l', 'alk_phos',
                    'albumin_value_g_dl', 'sodium', 'inr', 'dialysis_required',
                    'ThirtyMo', 'NinetyMo']
df = df1[selected_columns]

# One table per outcome: keep only the rows where that outcome is recorded,
# drop the other outcome column, and fill missing dialysis_required entries with 0
df_30 = (df.loc[df['ThirtyMo'].notna()]
           .drop(columns = ['NinetyMo'])
           .fillna({'dialysis_required': 0}))
df_90 = (df.loc[df['NinetyMo'].notna()]
           .drop(columns = ['ThirtyMo'])
           .fillna({'dialysis_required': 0}))

# Calculate MELD score (2016) before imputation
def add_meld_scores(frame):
    """Return a copy of ``frame`` with ``MELD`` and ``MELD_2016`` columns appended.

    MELD = 9.57 * ln(creatinine) + 3.78 * ln(bilirubin) + 11.2 * ln(INR) + 6.43
    where creatinine is limited to [1.0, 4.0] and replaced by 4 whenever
    ``dialysis_required`` is not 0, and bilirubin and INR are floored at 1.0.

    MELD_2016 equals MELD when MELD <= 11 (or MELD is missing); otherwise
    MELD + 1.32 * (137 - Na) - 0.033 * MELD * (137 - Na), with sodium limited
    to [125, 137].

    Missing lab values propagate as NaN into the scores, except that a missing
    creatinine is irrelevant for rows whose ``dialysis_required`` is not 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``creatinine_value_mg_dl``, ``bilirubin_value_mg_dl``,
        ``inr``, ``sodium`` and ``dialysis_required``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``MELD`` and ``MELD_2016`` are added after the existing columns.
    """
    creatinine = frame['creatinine_value_mg_dl'].clip(lower = 1.0, upper = 4.0)
    # Rows flagged for dialysis use the creatinine ceiling regardless of the lab value
    creatinine = creatinine.where(frame['dialysis_required'] == 0, 4.0)
    bilirubin = frame['bilirubin_value_mg_dl'].clip(lower = 1.0)
    inr = frame['inr'].clip(lower = 1.0)

    meld = (9.57 * np.log(creatinine) +
            3.78 * np.log(bilirubin) +
            11.2 * np.log(inr) +
            6.43)

    sodium_gap = 137 - frame['sodium'].clip(lower = 125, upper = 137)
    meld_with_sodium = meld + 1.32 * sodium_gap - 0.033 * meld * sodium_gap
    # The sodium term only applies above 11; a NaN MELD fails the comparison and stays NaN
    meld_2016 = meld_with_sodium.where(meld > 11, meld)

    return frame.assign(MELD = meld, MELD_2016 = meld_2016)


# The dialysis flag is only needed for the score, so it is dropped afterwards
df_30 = add_meld_scores(df_30).drop(['dialysis_required'], axis = 1)
df_90 = add_meld_scores(df_90).drop(['dialysis_required'], axis = 1)

# Write both tables, now carrying the MELD score (2016), to CSV without the row index
for frame, csv_name in ((df_30, 'MELD1_30.csv'), (df_90, 'MELD1_90.csv')):
    frame.to_csv(csv_name, index = False)

In [3]:
# (rows, columns) of the 30-day table, then of the 90-day table
for frame in (df_30, df_90):
    print(frame.shape)

(210, 14)
(158, 14)


In [4]:
# (rows, columns) once every row with at least one missing value is discarded
for frame in (df_30, df_90):
    print(frame.dropna().shape)

(176, 14)
(131, 14)


In [5]:
# Per-column summary statistics from DataFrame.describe(): 30-day table, then 90-day table
for frame in (df_30, df_90):
    print(frame.describe())

              age         alt          ast  creatinine_value_mg_dl  \
count  198.000000  194.000000   195.000000              195.000000   
mean    49.324523   57.530928   165.210256                1.145386   
std     10.557878   42.966206   152.584342                1.105042   
min     26.425000   14.000000    34.000000                0.305100   
25%     39.840972   32.000000    98.500000                0.600000   
50%     49.416667   44.000000   130.000000                0.770000   
75%     58.185417   65.750000   196.000000                1.110000   
max     74.763889  404.000000  1858.000000                8.140000   

       bilirubin_value_mg_dl  platelets_value_10to9_l  wbc_10to9_l  \
count             194.000000               191.000000   192.000000   
mean               16.044776               136.465969    11.612708   
std                 9.389153                79.246775     7.903400   
min                 2.500000                12.200000     0.000000   
25%                

In [6]:
# Share of rows that have at least one missing value, rounded to two decimals
for frame in (df_30, df_90):
    n_complete = len(frame.dropna())
    print(np.round(1 - n_complete / len(frame), decimals = 2))

0.16
0.17


In [7]:
# Split each table into its predictor matrix (X) and outcome labels (y)
X_30, y_30 = df_30.drop(columns = ['ThirtyMo']), df_30['ThirtyMo']
X_90, y_90 = df_90.drop(columns = ['NinetyMo']), df_90['NinetyMo']

# Keep the predictor names for later use: the *_raw lists hold every predictor
# column, the plain lists leave out the two MELD columns
X_list_30_raw = X_30.columns.tolist()
X_list_90_raw = X_90.columns.tolist()
X_list_30 = [name for name in X_list_30_raw if name != 'MELD' and name != 'MELD_2016']
X_list_90 = [name for name in X_list_90_raw if name != 'MELD' and name != 'MELD_2016']

In [8]:
# Check whether data is balanced: label counts for the 30-day, then the 90-day outcome
for labels in (y_30, y_90):
    print(labels.value_counts())

0    179
1     31
Name: ThirtyMo, dtype: int64
0.0    104
1.0     54
Name: NinetyMo, dtype: int64


In [9]:
# For died within 30 days outcome
#
# Every fold writes four CSV files (<k> is the fold number, starting at 0):
#   train1_30_raw<k>.csv / test1_30_raw<k>.csv : the split as-is, MELD columns included
#   train1_30_<k>.csv    / test1_30_<k>.csv    : MELD columns removed and missing values
#                                                imputed; the training part is also
#                                                oversampled with SMOTE

# The CSV headers are identical for every fold, so build them once
columns_30_raw = [*X_list_30_raw, 'ThirtyMo']
columns_30 = [*X_list_30, 'ThirtyMo']

# Stratified 5-fold split (no shuffle argument is passed)
skf = StratifiedKFold(n_splits = 5)

for fold, (train_index, test_index) in enumerate(skf.split(X_30, y_30)):
    X_train_30, y_train_30 = X_30.iloc[train_index], y_30.iloc[train_index]
    X_test_30, y_test_30 = X_30.iloc[test_index], y_30.iloc[test_index]

    # Export the raw data
    train_30 = pd.DataFrame(np.column_stack([X_train_30, y_train_30]), columns = columns_30_raw)
    test_30 = pd.DataFrame(np.column_stack([X_test_30, y_test_30]), columns = columns_30_raw)

    train_30.to_csv('train1_30_raw' + str(fold) + '.csv', index = False)
    test_30.to_csv('test1_30_raw' + str(fold) + '.csv', index = False)

    # The MELD columns are left out of the imputed / resampled data
    X_train_30 = X_train_30.drop(['MELD', 'MELD_2016'], axis = 1)
    X_test_30 = X_test_30.drop(['MELD', 'MELD_2016'], axis = 1)

    # Apply MICE imputation: the imputer is fitted on the training fold only and the
    # same fitted model is then applied to the test fold
    MICE_imputer = IterativeImputer(sample_posterior = True, min_value = 0, random_state = 0)
    X_train_30_imp = MICE_imputer.fit_transform(X_train_30)
    X_test_30_imp = MICE_imputer.transform(X_test_30)

    # Use SMOTE to over sample the minority class; only the training data is resampled
    sm = SMOTE(random_state = 0)
    X_train_30_res, y_train_30_res = sm.fit_resample(X_train_30_imp, y_train_30)

    # Export the processed data
    train_30 = pd.DataFrame(np.column_stack([X_train_30_res, y_train_30_res]), columns = columns_30)
    test_30 = pd.DataFrame(np.column_stack([X_test_30_imp, y_test_30]), columns = columns_30)

    train_30.to_csv('train1_30_' + str(fold) + '.csv', index = False)
    test_30.to_csv('test1_30_' + str(fold) + '.csv', index = False)

In [10]:
# For died within 90 days outcome
#
# Every fold writes four CSV files (<k> is the fold number, starting at 0):
#   train1_90_raw<k>.csv / test1_90_raw<k>.csv : the split as-is, MELD columns included
#   train1_90_<k>.csv    / test1_90_<k>.csv    : MELD columns removed and missing values
#                                                imputed; the training part is also
#                                                oversampled with SMOTE

# The CSV headers are identical for every fold, so build them once
columns_90_raw = [*X_list_90_raw, 'NinetyMo']
columns_90 = [*X_list_90, 'NinetyMo']

# Stratified 5-fold split (no shuffle argument is passed)
skf = StratifiedKFold(n_splits = 5)

for fold, (train_index, test_index) in enumerate(skf.split(X_90, y_90)):
    X_train_90, y_train_90 = X_90.iloc[train_index], y_90.iloc[train_index]
    X_test_90, y_test_90 = X_90.iloc[test_index], y_90.iloc[test_index]

    # Export the raw data
    train_90 = pd.DataFrame(np.column_stack([X_train_90, y_train_90]), columns = columns_90_raw)
    test_90 = pd.DataFrame(np.column_stack([X_test_90, y_test_90]), columns = columns_90_raw)

    train_90.to_csv('train1_90_raw' + str(fold) + '.csv', index = False)
    test_90.to_csv('test1_90_raw' + str(fold) + '.csv', index = False)

    # The MELD columns are left out of the imputed / resampled data
    X_train_90 = X_train_90.drop(['MELD', 'MELD_2016'], axis = 1)
    X_test_90 = X_test_90.drop(['MELD', 'MELD_2016'], axis = 1)

    # Apply MICE imputation: the imputer is fitted on the training fold only and the
    # same fitted model is then applied to the test fold
    MICE_imputer = IterativeImputer(sample_posterior = True, min_value = 0, random_state = 0)
    X_train_90_imp = MICE_imputer.fit_transform(X_train_90)
    X_test_90_imp = MICE_imputer.transform(X_test_90)

    # Use SMOTE to over sample the minority class; only the training data is resampled
    sm = SMOTE(random_state = 0)
    X_train_90_res, y_train_90_res = sm.fit_resample(X_train_90_imp, y_train_90)

    # Export the processed data
    train_90 = pd.DataFrame(np.column_stack([X_train_90_res, y_train_90_res]), columns = columns_90)
    test_90 = pd.DataFrame(np.column_stack([X_test_90_imp, y_test_90]), columns = columns_90)

    train_90.to_csv('train1_90_' + str(fold) + '.csv', index = False)
    test_90.to_csv('test1_90_' + str(fold) + '.csv', index = False)