# Development of machine learning models to process Electronic Health Records – Explainable Models

### Testing Notebook
Lok Hang Toby Lee (2431180L)

## Configuration Step
1. Imports
2. Set database configurations
3. Connect to MIMIC-III local postgreSQL database

In [2]:
# Imports:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as mc
import colorsys
import psycopg2
import os
import yaml
%matplotlib inline

# Configuration:
sqluser = 'postgres'
dbname = 'mimic'
password='postgres'
schema_name = 'public, mimic, mimiciii;'

# Connect to MIMIC-III:
con = psycopg2.connect(dbname=dbname, user=sqluser, password=password)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)

# Proposal 1: Train a machine learning model to predict Mortality rate given features from health record

Features (x):
1. ICU stay days
2. Reason of ICU stay (null if 0 day stays)
3. Age
4. Gender
4. Heart rate
5. Height

Label (y): Mortality of patient

Aims:
1. If the patient did not stay in ICU and have normal features, the mortality should be low. 
2. Find correlation from machine learning if the selected features are significant for mortality rate



## Extract Data for Machine Learning:

1. Extract the data for the features from the database
2. Group up the data into a single pandas array
3. split the pandas array into train validation test set

In [3]:
# Imports:
import psycopg2
import numpy as np
import pandas as pd
import os
import yaml

### Study cohort selection
- Only first ICU admissions that took at least a day and less than 10 days
- Adult patients only (age >= 15)

In [4]:
# Settings for the query:
min_age = 15
limit_population = 0 # if we want to run the query for a small number of patients (for debugging)
if limit_population > 0:
    limit = 'LIMIT ' + str(limit_population)
else:
    limit = ''

In [51]:
query = """
with patient_and_icustay_details as (
    SELECT distinct
        p.gender, p.dob, p.dod, s.*, a.admittime, a.dischtime, a.deathtime, a.ethnicity, a.diagnosis, a.hospital_expire_flag,
        DENSE_RANK() OVER (PARTITION BY a.subject_id ORDER BY a.admittime) AS hospstay_seq,
        round((EXTRACT(EPOCH FROM (a.dischtime-a.admittime))/60/60/24) :: NUMERIC, 4) as hospital_los,
        DENSE_RANK() OVER (PARTITION BY s.hadm_id ORDER BY s.intime) AS icustay_seq,
        DATE_PART('year', s.intime) - DATE_PART('year', p.dob) as admission_age,
        DATE_PART('day', s.outtime - s.intime) as los_icu
    FROM patients p 
        INNER JOIN icustays s ON p.subject_id = s.subject_id
        INNER JOIN admissions a ON s.hadm_id = a.hadm_id 
    WHERE s.first_careunit NOT like 'NICU'
        and s.hadm_id is not null and s.icustay_id is not null
        and (s.outtime >= (s.intime + interval '12 hours'))
        and (s.outtime <= (s.intime + interval '240 hours'))
    ORDER BY s.subject_id 
)
SELECT * 
FROM patient_and_icustay_details 
WHERE hospstay_seq = 1
    and icustay_seq = 1
    and admission_age >=  """ + str(min_age) + """
    and los_icu >= 0.5
""" + str(limit)
patients_data = pd.read_sql_query('SET search_path to ' + schema_name + query, con)

# Save result:
patients_data.to_csv('static_data.csv')

In [52]:
patients_data

Unnamed: 0,gender,dob,dod,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,...,dischtime,deathtime,ethnicity,diagnosis,hospital_expire_flag,hospstay_seq,hospital_los,icustay_seq,admission_age,los_icu
0,M,2025-04-11,2102-06-14,2,3,145834,211552,carevue,MICU,MICU,...,2101-10-31 13:58:00,NaT,WHITE,HYPOTENSION,0,1,10.7847,1,76.0,6.0
1,F,2143-05-12,NaT,3,4,185777,294638,carevue,MICU,MICU,...,2191-03-23 18:41:00,NaT,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",0,1,7.7590,1,48.0,1.0
2,F,2109-06-21,NaT,5,6,107064,228232,carevue,SICU,SICU,...,2175-06-15 16:00:00,NaT,WHITE,CHRONIC RENAL FAILURE/SDA,0,1,16.3646,1,66.0,3.0
3,M,2108-01-26,2149-11-14,9,9,150750,220597,carevue,MICU,MICU,...,2149-11-14 10:15:00,2149-11-14 10:15:00,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,1,1,4.8813,1,41.0,5.0
4,F,2128-02-22,2178-11-14,11,11,194540,229441,carevue,SICU,SICU,...,2178-05-11 19:00:00,NaT,WHITE,BRAIN MASS,0,1,25.5292,1,50.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30058,M,2114-09-29,NaT,61527,99983,117390,286606,metavision,CCU,CCU,...,2193-04-29 13:30:00,NaT,UNKNOWN/NOT SPECIFIED,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,0,1,3.0799,1,79.0,1.0
30059,M,2137-04-07,NaT,61529,99991,151118,226241,metavision,TSICU,TSICU,...,2185-01-05 12:15:00,NaT,WHITE,DIVERTICULITIS/SDA,0,1,12.1563,1,47.0,3.0
30060,F,2078-10-17,NaT,61530,99992,197084,242052,metavision,MICU,MICU,...,2144-07-28 17:56:00,NaT,WHITE,RETROPERITONEAL HEMORRHAGE,0,1,2.9951,1,66.0,1.0
30061,F,2058-05-29,2147-09-29,61531,99995,137810,229633,metavision,CSRU,CSRU,...,2147-02-11 13:15:00,NaT,WHITE,ABDOMINAL AORTIC ANEURYSM/SDA,0,1,3.2188,1,89.0,2.0


In [53]:
print('Total hospital mortality numbers: ', patients_data.drop_duplicates(['hadm_id']).groupby('hospital_expire_flag')
      .hospital_expire_flag.count())

print('Median of length of stay in hospital (in days): ', patients_data.drop_duplicates(['hadm_id']).hospital_los.size)
print('mortality size', patients_data.drop_duplicates(['hadm_id']).hospital_expire_flag.size)

Total hospital mortality numbers:  hospital_expire_flag
0    27132
1     2931
Name: hospital_expire_flag, dtype: int64
Median of length of stay in hospital (in days):  30063
mortality size 30063


## Linear Regression Base model

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [127]:
df_train, df_test = train_test_split(patients_data, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 100)

numeric_vars = ['hospital_los','admission_age','los_icu']

x_train = df_train[numeric_vars]
y_train = df_train['hospital_expire_flag']

x_test = df_test[numeric_vars]
y_test = df_test['hospital_expire_flag']


lm = LinearRegression()
lm.fit(x_train, y_train)
lm.score(x_test,y_test)

prediction=lm.predict(x_test)

output

array([0.17428963, 0.09767147, 0.05698387, ..., 0.09275903, 0.07288086,
       0.12493275])

In [115]:
y= y_train.to_numpy().reshape(-1,1)

y.shape

(21044, 1)

In [33]:
#Length of Stay
query = \
"""
SELECT p.subject_id, i.first_careunit, i.hadm_id,
round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) as age,
round((EXTRACT(EPOCH FROM (a.dischtime-a.admittime))/60/60/24) :: NUMERIC, 4) as hospital_los
FROM mimiciii.patients p
INNER JOIN mimiciii.icustays i ON p.subject_id = i.subject_id
INNER JOIN mimiciii.admissions a ON a.hadm_id = i.hadm_id
WHERE round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) >= 16;
"""

length_of_stay = pd.read_sql_query(query,con)

In [35]:
length_of_stay

Unnamed: 0,subject_id,first_careunit,hadm_id,age,hospital_los
0,268,MICU,110404,65.9770,6.5938
1,269,MICU,106296,40.1007,22.2889
2,270,CCU,188028,80.0778,3.7535
3,271,MICU,173727,45.6869,12.8778
4,272,CCU,164716,67.0976,7.7438
...,...,...,...,...,...
53418,94944,CSRU,143774,77.1126,8.9222
53419,94950,CCU,123750,300.0037,4.4813
53420,94953,SICU,196881,53.0944,1.2313
53421,94954,CSRU,118475,67.8408,7.4111


In [8]:
#Mortality numbers
query = \
"""
SELECT i.hadm_id, i.first_careunit, a.hospital_expire_flag,
round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) as age
FROM mimiciii.patients p
INNER JOIN mimiciii.icustays i ON p.subject_id = i.subject_id
INNER JOIN mimiciii.admissions a ON i.hadm_id = a.hadm_id
WHERE round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) >= 16;
"""

mortality_numbers = pd.read_sql_query(query,con)

In [34]:
mortality_numbers


Unnamed: 0,hadm_id,first_careunit,hospital_expire_flag,age
0,110404,MICU,1,65.9770
1,106296,MICU,0,40.1007
2,188028,CCU,0,80.0778
3,173727,MICU,0,45.6869
4,164716,CCU,0,67.0976
...,...,...,...,...
53418,143774,CSRU,0,77.1126
53419,123750,CCU,0,300.0037
53420,196881,SICU,0,53.0944
53421,118475,CSRU,0,67.8408


## Training the model
1. Train the model using different methods (Linear regression)
2. Train and combine the features
3. Note the effectiveness of each feature (by analysing covarinace matrix)
4. Compare the performance of models by f1 score
5. Principle component analysis to visualise the results
6. Select the final model 

In [59]:
x=patients_data['hospital_los']
y=patients_data['hospital_expire_flag']

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split
from matplotlib import pyplot as plt
import seaborn as sn

In [62]:
def undersample_majority(x_train, y_train):
    
    # Separate the positive and negative x/y data:
    pos_features = x_train[y_train == 1]
    neg_features = x_train[y_train == 0]
    pos_labels = y_train[y_train == 1]
    neg_labels = y_train[y_train == 0]
    
    # Count the number of negative data points, and select that many from the positive data points. This means that 
    # positive samples will be 'copied' or repeated and hence occur multiple times in the training data:
    ids = np.arange(len(neg_features))
    choices = np.random.choice(ids, len(pos_features))
    res_neg_features = neg_features[choices]
    res_neg_labels = neg_labels[choices]
    
    # Combine the resamples and negative data points, and shuffle:
    resampled_features = np.concatenate([res_neg_features, pos_features], axis=0)
    resampled_labels = np.concatenate([res_neg_labels, pos_labels], axis=0)
    order = np.arange(len(resampled_labels))
    np.random.shuffle(order)
    resampled_features = resampled_features[order]
    resampled_labels = resampled_labels[order]
    return resampled_features, resampled_labels

In [63]:
# Stratified 5-fold cross-validation:
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
predictions_logreg = []
y_true_logreg = []
for train_index, test_index in kf.split(x, y):
    
    # Split in training and test set, and split the training further in training and validation sets 
    x_train, x_val, y_train, y_val = train_test_split(x[train_index], y[train_index], test_size=0.125, random_state=0, 
                                            stratify=y[train_index])
    x_test, y_test = x[test_index], y[test_index]
    
    x_train, y_train = undersample_majority(x_train, y_train)
    
    # Reshape:
    x_train_lr = np.reshape(x_train, (x_train.shape[0], -1))
    x_val_lr = np.reshape(x_val, (x_val.shape[0], -1))
    x_test_lr = np.reshape(x_test, (x_test.shape[0], -1))
    
    # Initialize and fit a model:
    logreg = LogisticRegression(penalty = 'l2', C = 1, random_state = 0)
    logreg.fit(x_train_lr, y_train)
    
    # Make predictions:
    pred = logreg.predict_proba(x_test_lr)
    predictions_logreg.append(list(pred))
    y_true_logreg.append(list(y_test))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


Exception: Data must be 1-dimensional