In [1]:
###
#  FUTON Model MDP + Q-Learning Creation Script
#  A Research Project conducted by Noah Dunn 
###

# Import the standard tools for working with Pandas dataframe
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shelve
# Import the MDP toolbox that contains a method for conducting Q-Learning
# Tool can be found here: https://github.com/sawcordwell/pymdptoolbox
# Documentation for the tool can be found here: https://pymdptoolbox.readthedocs.io/en/latest/
import mdptoolbox

In [2]:
#  The data file that will be used to conduct the experiments.
#  The location defaults to the original path, but can be overridden through the
#  FUTON_PATIENT_DATA environment variable so the notebook can run on another
#  machine without editing this cell.
import os

PATIENT_DATA_PATH = os.environ.get("FUTON_PATIENT_DATA", "G:/MIMIC-ALL/MIMIC-PATIENTS/patient_data_modified.csv")
patientdata = pd.read_csv(PATIENT_DATA_PATH)

In [3]:
### 
#  An MDP, or Markov Decision Process, is used to model relationships between various states and actions.
#  In this medical setting, a state can be thought of as a patient's condition based on current vitals and state of being.
#  An action can be thought of as a change in that condition based on one of those vitals.
#  The inspiration for the bulk of this code came from Komorowski's AI Clinician, which can be found
#  here: https://github.com/matthieukomorowski/AI_Clinician/blob/master/AIClinician_core_160219.m
###

###
# Begin by establishing some global variables for use in the MDP creation.
# Only clustering_iter, cluster_sample, state_count and mdp_count are consumed by the cells
# shown in this part of the notebook; the others are presumably used further down.
###
mdp_count = 500            # The number of repetitions we want / count of MDPs we need to create
clustering_iter = 32       # The number of times clustering will be conducted (passed to KMeans as n_init)
cluster_sample = 0.25      # Proportion of the training data used for clustering
gamma = 0.99               # How close we desire clusters to be in similarity (Percentage)
                           # NOTE(review): presumably this is really the RL discount factor, as in the
                           # AI Clinician script this notebook is based on -- TODO confirm and reword
transition_threshold = 5   # The cutoff value for the transition matrix
final_policies = 1         # The number of policies we would like to end up with
state_count = 750          # The number of distinct states (used as the number of K-Means clusters)
action_count = 5           # Number of actions per state (recommended 2 to 10)
crossval_iter = 10         # Number of cross-validation runs (Default is 80% Train, 20% Test)

In [4]:
###
# Data structures to hold our interim data
###

# Allocate every structure pre-filled with NaN, so that a slot which is never
# written is recognisable as "missing" instead of holding whatever bytes were
# left in memory (which is what a bare np.empty would expose).

# One column of per-state actions for each MDP.
# Not sure the significance of the 2 yet
# (presumably two extra terminal/absorbing states -- TODO confirm).
optimal_actions = np.full((state_count + 2, mdp_count), np.nan)

# Two rows per MDP, 30 recorded values per row.
model_data = np.full((mdp_count * 2, 30), np.nan)

# One row per MDP, 15 recorded values per row.
bestmodels_data = np.full((mdp_count, 15), np.nan)

In [5]:
# Every distinct ICU stay in the data set is treated as one patient
icu_ids = patientdata['icustayid'].unique()

# How many patients are available for building states
id_count = len(icu_ids)
print(id_count)

# NaN-initialised table with one row per patient and one column per MDP
patient_idxs = np.full((id_count, mdp_count), np.nan)

21463


In [6]:
# Every feature column belongs to exactly one of three groups, and each group
# is normalised differently later on:
# 1. Binary values (0 or 1)
# 2. Standard ranges (plain integers and decimals)
# 3. Values that get a logarithm applied before scaling

colbin = [
    'gender', 'mechvent', 'max_dose_vaso', 're_admission', 'qSOFAFlag', 'SOFAFlag',
]
colnorm = [
    'age', 'Weight_kg', 'GCS', 'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'Temp_C', 'FiO2_1',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT', 'Arterial_pH', 'paO2', 'paCO2',
    'Arterial_BE', 'HCO3', 'Arterial_lactate', 'SOFA', 'SIRS', 'Shock_Index', 'PaO2_FiO2',
    'cumulated_balance', 'qSOFA',
]
collog = [
    'SpO2', 'BUN', 'Creatinine', 'SGOT', 'SGPT', 'Total_bili', 'INR',
    'input_total', 'input_4hourly', 'output_total', 'output_4hourly',
]

# Pull each group of columns out into its own dataframe
colbin_df = patientdata.loc[:, colbin]
colnorm_df = patientdata.loc[:, colnorm]
collog_df = patientdata.loc[:, collog]

# Quick visual check that all three selections look right
print(colbin_df, "\n", colnorm_df, "\n", collog_df)


        gender  mechvent  max_dose_vaso  re_admission  qSOFAFlag  SOFAFlag
0            0         1            0.0             0          1         0
1            0         1            0.0             0          1         1
2            0         1            0.0             0          1         1
3            0         1            0.0             0          1         1
4            0         1            0.0             0          1         1
...        ...       ...            ...           ...        ...       ...
238325       0         0            0.0             0          1         0
238326       0         0            0.0             0          1         0
238327       0         0            0.0             0          1         0
238328       0         0            0.0             0          1         0
238329       0         0            0.0             0          1         0

[238330 rows x 6 columns] 
                 age  Weight_kg        GCS         HR       SysBP     Me

In [7]:
# Glue the three groups back together side by side, so the columns run
# binary -> normal -> log from left to right
ordered_groups = [colbin_df, colnorm_df, collog_df]
MIMIC_raw = pd.concat(ordered_groups, axis="columns")
print(MIMIC_raw)

        gender  mechvent  max_dose_vaso  re_admission  qSOFAFlag  SOFAFlag  \
0            0         1            0.0             0          1         0   
1            0         1            0.0             0          1         1   
2            0         1            0.0             0          1         1   
3            0         1            0.0             0          1         1   
4            0         1            0.0             0          1         1   
...        ...       ...            ...           ...        ...       ...   
238325       0         0            0.0             0          1         0   
238326       0         0            0.0             0          1         0   
238327       0         0            0.0             0          1         0   
238328       0         0            0.0             0          1         0   
238329       0         0            0.0             0          1         0   

                age  Weight_kg        GCS         HR  ...    BU

In [8]:
# We want a Z-Score for every item: how many standard deviations a value lies
# from the mean of its column. This puts all features on a comparable scale.

# Scipy provides a function that computes this per column
from scipy.stats import zscore

# Work on a copy. A plain `MIMIC_zscores = MIMIC_raw` only creates a second name
# for the same frame, so every assignment below would also rewrite MIMIC_raw and
# re-running this cell would shift/transform the already-transformed values again.
MIMIC_zscores = MIMIC_raw.copy()

# Binary columns: no need for the zscore algorithm, centring 0/1 to -0.5/0.5 suffices
MIMIC_zscores[colbin] = MIMIC_zscores[colbin] - 0.5

# Log columns: take the logarithm first (+ 0.1 to avoid log(0)), then Z-Score the result
MIMIC_zscores[collog] = np.log(MIMIC_zscores[collog] + 0.1).apply(zscore)

# Normal columns require no modifications. Z-Scores are calculated as normal
# NOTE(review): zscore propagates NaN by default, so a single missing value
# turns the whole column into NaN -- confirm the input has no gaps.
MIMIC_zscores[colnorm] = MIMIC_zscores[colnorm].apply(zscore)

# Shows the frame BEFORE the two manual adjustments below are applied
print(MIMIC_zscores)

# We want re-admission and fluid intake scaled similarly to the other variables.
# re_admission was centred above, so for 0/1 inputs the + 0.6 keeps the log argument positive.
MIMIC_zscores['re_admission'] = np.log(MIMIC_zscores['re_admission'] + 0.6)
# Apply a scalar to fluid intake to increase its weight
MIMIC_zscores['input_total'] = 2 * MIMIC_zscores['input_total']

        gender  mechvent  max_dose_vaso  re_admission  qSOFAFlag  SOFAFlag  \
0         -0.5       0.5           -0.5          -0.5        0.5      -0.5   
1         -0.5       0.5           -0.5          -0.5        0.5       0.5   
2         -0.5       0.5           -0.5          -0.5        0.5       0.5   
3         -0.5       0.5           -0.5          -0.5        0.5       0.5   
4         -0.5       0.5           -0.5          -0.5        0.5       0.5   
...        ...       ...            ...           ...        ...       ...   
238325    -0.5      -0.5           -0.5          -0.5        0.5      -0.5   
238326    -0.5      -0.5           -0.5          -0.5        0.5      -0.5   
238327    -0.5      -0.5           -0.5          -0.5        0.5      -0.5   
238328    -0.5      -0.5           -0.5          -0.5        0.5      -0.5   
238329    -0.5      -0.5           -0.5          -0.5        0.5      -0.5   

             age  Weight_kg       GCS        HR  ...       BUN 

In [53]:
### The main loop to generate all possible models

num_rows = id_count  # Total number of patients to divvy the data up
testing_flag = 1     # The group number that marks a patient as belonging to the test set

# TODO: Change this to 1 in MDP_COUNT
#for model in range(1, 2): #mdp_count):

# We want approximately 20% test, 80% train, so every patient draws a random group number 1-5.
# 1s represent patients that will be used to test, 2-5 will be used to train.
# A single vectorised draw consumes the random stream exactly like one scalar draw per patient.
# NOTE(review): no random seed is set, so the split differs on every run.
group_ids = pd.DataFrame(np.floor(5 * np.random.random(id_count) + 1).astype('int64'))
icu_pair_set = pd.DataFrame({'id': icu_ids, 'fil_val': group_ids[0]})
train_ids = icu_pair_set[icu_pair_set['fil_val'] != testing_flag]   # id/group rows of the training patients
test_ids = icu_pair_set[icu_pair_set['fil_val'] == testing_flag]    # id/group rows of the testing patients

# We want to ensure that the testing patients + training patients = total patients.
# Raise instead of calling exit(): inside a notebook exit() takes down the kernel
# (and all state with it) rather than reporting what went wrong.
if (train_ids['id'].size + test_ids['id'].size) != id_count:
    raise ValueError("The testing and training set do not add up to the total set")

# Percentage for testing should be about 20%, Training about 80%
print("Testing Percentage: " + str((test_ids['id'].size / id_count)))
print("Training Percentage: " + str((train_ids['id'].size / id_count)))

# After grabbing all the IDs, we want to flag all the rows that are train or test
train_flag = patientdata['icustayid'].isin(train_ids['id'])
test_flag = patientdata['icustayid'].isin(test_ids['id'])

# Validate that all rows are selected, and that the train and test masks are exact opposites
if patientdata['icustayid'].size != train_flag.size or not train_flag.equals(~test_flag):
    raise ValueError("Not all rows were grabbed properly, there is something wrong with the split")

Testing Percentage: 0.19945953501374458
Training Percentage: 0.8005404649862554


In [55]:
# Separate the Z-Scores for the training set and the testing set
train_zscores = MIMIC_zscores[train_flag]
test_zscores = MIMIC_zscores[test_flag]

# Validate all data is selected: the two parts together must account for every value.
# Raise (rather than print + exit(), which would kill the notebook kernel) and say
# what actually went wrong -- the old message claimed the opposite of the failure.
if train_zscores.size + test_zscores.size != MIMIC_zscores.size:
    raise ValueError("The Z-Scores were not fully split between the training and testing sets")


# The blocs of relevance in order based on the train and test set
# These will be used to build relevant data frames later down
# (.loc selects rows and column in one step instead of first copying all columns of the masked rows)
train_blocs = patientdata.loc[train_flag, 'bloc']
test_blocs = patientdata.loc[test_flag, 'bloc']

# Doing the same with the patient ids
train_id_list = patientdata.loc[train_flag, 'icustayid']
test_id_list = patientdata.loc[test_flag, 'icustayid']

# Grabbing the 90-day mortality values for the rows of the training set
train_90d = patientdata.loc[train_flag, 'mortality_90d']

In [65]:
# Only a cluster_sample share of the training rows is handed to the clustering
# step, so each training row is independently flagged as "in the sample" or not.

# How the flag is produced: a uniform draw in [0, 1) plus cluster_sample reaches 1
# or more with probability cluster_sample, so flooring the sum yields 1 for roughly
# that share of the rows (about 25% when cluster_sample is 0.25) and 0 for the rest.
# len(train_zscores.index) is a quick way to get the number of rows of a dataframe.
train_row_total = len(train_zscores.index)
sample_train_flags = np.floor(np.random.random(train_row_total) + cluster_sample).astype(bool).tolist()

# It's good to know how much of the data actually ended up in the sample
sampled_fraction = sample_train_flags.count(True) / len(sample_train_flags)
print("Proportion of Train Data used for the Sample: " + str(sampled_fraction))

# The rows that will actually be clustered
sample_train_set = train_zscores[sample_train_flags]

Proportion of Data used for the Sample: 0.251288727440708


In [72]:
# To obtain a proper set of states, k-means clustering is used to group the
# sampled patient rows into distinct states based on their Z-Scores.

# K-Means (here with K-Means++ seeding) condenses very diverse and sparse data into
# groups of similar points called 'clusters': it creates k clusters from N data points.
# For this research that means grouping rows with similar data (age, blood pressure,
# etc.) and placing a faux 'point' at the centre of each such group.

# Scikit-learn offers an implementation of K-Means++ clustering
from sklearn.cluster import KMeans

# The three 'settings' arguments passed to KMeans:
# 1. n_clusters: how many clusters (later used as states) the algorithm should produce;
#    preset to state_count, which is 750
# 2. max_iter: upper bound on the number of refinement iterations within a single run,
#    set to 10,000 here
# 3. n_init: how many independent runs (each from different starting centroids) are
#    performed; the best of them is the one kept in clusters_models
clusters_models = KMeans(
    n_clusters=state_count,
    max_iter=10000,
    n_init=clustering_iter,
).fit(sample_train_set)

In [80]:
# Show the cluster index assigned to every sampled row, followed by the
# coordinates of each cluster centre (one output line block per array)
print(clusters_models.labels_, clusters_models.cluster_centers_, sep="\n")

[321 229 527 ... 137 573 155]
[[-0.2254902  -0.32352941 -0.42817647 ... -0.07048865  0.20045151
   0.04158715]
 [ 0.09405941 -0.04455446 -0.47672277 ...  0.66545596  0.32843116
   0.42501051]
 [ 0.04545455 -0.15454545 -0.49425455 ...  0.30664516  0.28904419
   0.42852166]
 ...
 [-0.34615385 -0.38461538 -0.46365385 ... -1.47820946 -0.18871191
   0.04611651]
 [ 0.01351351 -0.01351351 -0.46678378 ...  0.34582232  0.109389
   0.14752571]
 [ 0.5        -0.5        -0.5        ... -1.52106461 -2.31206609
  -1.85290276]]


In [83]:
# pickle is Python's built-in object serialization; it makes writing/reading objects quick
import pickle

# Checkpoint the important results of the K-Means step (labels and centres).
# Clustering takes quite a while, so this saves compute time until the code is put into dev.
# Note that the files carry a .txt name but hold binary pickle data.
cluster_checkpoints = (
    ('cluster_labels.txt', clusters_models.labels_),
    ('cluster_centers.txt', clusters_models.cluster_centers_),
)
for checkpoint_name, fitted_values in cluster_checkpoints:
    with open(checkpoint_name, 'wb') as fp:
        pickle.dump(fitted_values, fp)