# Development of machine learning models to process Electronic Health Records – Explainable Models

### Testing Notebook
Lok Hang Toby Lee (2431180L)

## Configuration Step
1. Imports
2. Set database configurations
3. Connect to MIMIC-III local postgreSQL database

In [1]:
# Imports:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as mc
import colorsys
import psycopg2
import os
import yaml
%matplotlib inline


#pg_ctl.exe restart -D "E:\PostgreSQL\data"

# Configuration:
sqluser = 'postgres'
dbname = 'mimic'
password='postgres'
schema_name = 'public, mimic, mimiciii;'

# Connect to MIMIC-III:
con = psycopg2.connect(dbname=dbname, user=sqluser, password=password)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)

# Proposal 1: Train a machine learning model to predict Mortality rate given features from health record

Features (x):
1. ICU stay days
2. Reason of ICU stay (null if 0 day stays)
3. Age
4. Gender
4. Heart rate
5. Height

Label (y): Mortality of patient

Aims:
1. If the patient did not stay in ICU and have normal features, the mortality should be low. 
2. Find correlation from machine learning if the selected features are significant for mortality rate



## Extracting Data and features for Machine Learning:

- Static features: Age, sex, ethnicity
- Time series features: Mechanical Ventilation, Colloid Bolus, Heart Rate, Glucose

In [2]:
# Imports:
import psycopg2
import numpy as np
import pandas as pd
import os
import yaml

### Study cohort selection
- Only first ICU admissions that took at least a day and less than 10 days
- Adult patients only (age >= 15)


In [4]:
# Settings for the query:
min_age = 15
limit_population = 0 # if we want to run the query for a small number of patients (for debugging)
if limit_population > 0:
    limit = 'LIMIT ' + str(limit_population)
else:
    limit = ''

In [5]:
query = """
with patient_and_icustay_details as (
    SELECT distinct
        p.gender, p.dob, p.dod, s.*, a.admittime, a.dischtime, a.deathtime, a.ethnicity, a.diagnosis, a.hospital_expire_flag,
        DENSE_RANK() OVER (PARTITION BY a.subject_id ORDER BY a.admittime) AS hospstay_seq,
        round((EXTRACT(EPOCH FROM (a.dischtime-a.admittime))/60/60/24) :: NUMERIC, 4) as hospital_los,
        DENSE_RANK() OVER (PARTITION BY s.hadm_id ORDER BY s.intime) AS icustay_seq,
        DATE_PART('year', s.intime) - DATE_PART('year', p.dob) as admission_age,
        DATE_PART('day', s.outtime - s.intime) as los_icu
    FROM patients p 
        INNER JOIN icustays s ON p.subject_id = s.subject_id
        INNER JOIN admissions a ON s.hadm_id = a.hadm_id 
    WHERE s.first_careunit NOT like 'NICU'
        and s.hadm_id is not null and s.icustay_id is not null
        and (s.outtime >= (s.intime + interval '12 hours'))
        and (s.outtime <= (s.intime + interval '240 hours'))
    ORDER BY s.subject_id 
)
SELECT * 
FROM patient_and_icustay_details 
WHERE hospstay_seq = 1
    and icustay_seq = 1
    and admission_age >=  """ + str(min_age) + """
    and los_icu >= 0.5
""" + str(limit)
patients_data = pd.read_sql_query('SET search_path to ' + schema_name + query, con)

# Save result:
patients_data.to_csv('static_data.csv')

In [3]:
patients_data = pd.read_csv('static_data.csv')  

patients_data 


Unnamed: 0.1,Unnamed: 0,gender,dob,dod,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,...,dischtime,deathtime,ethnicity,diagnosis,hospital_expire_flag,hospstay_seq,hospital_los,icustay_seq,admission_age,los_icu
0,0,M,2025-04-11 00:00:00,2102-06-14 00:00:00,2,3,145834,211552,carevue,MICU,...,2101-10-31 13:58:00,,WHITE,HYPOTENSION,0,1,10.7847,1,76.0,6.0
1,1,F,2143-05-12 00:00:00,,3,4,185777,294638,carevue,MICU,...,2191-03-23 18:41:00,,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",0,1,7.7590,1,48.0,1.0
2,2,F,2109-06-21 00:00:00,,5,6,107064,228232,carevue,SICU,...,2175-06-15 16:00:00,,WHITE,CHRONIC RENAL FAILURE/SDA,0,1,16.3646,1,66.0,3.0
3,3,M,2108-01-26 00:00:00,2149-11-14 00:00:00,9,9,150750,220597,carevue,MICU,...,2149-11-14 10:15:00,2149-11-14 10:15:00,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,1,1,4.8813,1,41.0,5.0
4,4,F,2128-02-22 00:00:00,2178-11-14 00:00:00,11,11,194540,229441,carevue,SICU,...,2178-05-11 19:00:00,,WHITE,BRAIN MASS,0,1,25.5292,1,50.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30058,30058,M,2114-09-29 00:00:00,,61527,99983,117390,286606,metavision,CCU,...,2193-04-29 13:30:00,,UNKNOWN/NOT SPECIFIED,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,0,1,3.0799,1,79.0,1.0
30059,30059,M,2137-04-07 00:00:00,,61529,99991,151118,226241,metavision,TSICU,...,2185-01-05 12:15:00,,WHITE,DIVERTICULITIS/SDA,0,1,12.1563,1,47.0,3.0
30060,30060,F,2078-10-17 00:00:00,,61530,99992,197084,242052,metavision,MICU,...,2144-07-28 17:56:00,,WHITE,RETROPERITONEAL HEMORRHAGE,0,1,2.9951,1,66.0,1.0
30061,30061,F,2058-05-29 00:00:00,2147-09-29 00:00:00,61531,99995,137810,229633,metavision,CSRU,...,2147-02-11 13:15:00,,WHITE,ABDOMINAL AORTIC ANEURYSM/SDA,0,1,3.2188,1,89.0,2.0


In [7]:
print('Total hospital mortality numbers: ', patients_data.drop_duplicates(['hadm_id']).groupby('hospital_expire_flag')
      .hospital_expire_flag.count())

print('Median of length of stay in hospital (in days): ', patients_data.drop_duplicates(['hadm_id']).hospital_los.size)
print('mortality size', patients_data.drop_duplicates(['hadm_id']).hospital_expire_flag.size)

Total hospital mortality numbers:  hospital_expire_flag
0    27132
1     2931
Name: hospital_expire_flag, dtype: int64
Median of length of stay in hospital (in days):  30063
mortality size 30063


In [12]:
#Length of Stay
query = \
"""
SELECT p.subject_id, i.first_careunit, i.hadm_id,
round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) as age,
round((EXTRACT(EPOCH FROM (a.dischtime-a.admittime))/60/60/24) :: NUMERIC, 4) as hospital_los
FROM mimiciii.patients p
INNER JOIN mimiciii.icustays i ON p.subject_id = i.subject_id
INNER JOIN mimiciii.admissions a ON a.hadm_id = i.hadm_id
WHERE round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) >= 16;
"""

length_of_stay = pd.read_sql_query(query,con)

In [13]:
length_of_stay

Unnamed: 0,subject_id,first_careunit,hadm_id,age,hospital_los
0,268,MICU,110404,65.9770,6.5938
1,269,MICU,106296,40.1007,22.2889
2,270,CCU,188028,80.0778,3.7535
3,271,MICU,173727,45.6869,12.8778
4,272,CCU,164716,67.0976,7.7438
...,...,...,...,...,...
53418,94944,CSRU,143774,77.1126,8.9222
53419,94950,CCU,123750,300.0037,4.4813
53420,94953,SICU,196881,53.0944,1.2313
53421,94954,CSRU,118475,67.8408,7.4111


In [14]:
#Mortality numbers
query = \
"""
SELECT i.hadm_id, i.first_careunit, a.hospital_expire_flag,
round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) as age
FROM mimiciii.patients p
INNER JOIN mimiciii.icustays i ON p.subject_id = i.subject_id
INNER JOIN mimiciii.admissions a ON i.hadm_id = a.hadm_id
WHERE round((EXTRACT(EPOCH FROM (a.admittime-p.dob))/60/60/24/365.242) :: NUMERIC, 4) >= 16;
"""

mortality_numbers = pd.read_sql_query(query,con)

In [15]:
mortality_numbers


Unnamed: 0,hadm_id,first_careunit,hospital_expire_flag,age
0,110404,MICU,1,65.9770
1,106296,MICU,0,40.1007
2,188028,CCU,0,80.0778
3,173727,MICU,0,45.6869
4,164716,CCU,0,67.0976
...,...,...,...,...
53418,143774,CSRU,0,77.1126
53419,123750,CCU,0,300.0037
53420,196881,SICU,0,53.0944
53421,118475,CSRU,0,67.8408


# Training the model
1. Train the model using different methods (Linear regression)
2. Train and combine the features
3. Note the effectiveness of each feature (by analysing covarinace matrix)
4. Compare the performance of models by f1 score
5. Principle component analysis to visualise the results
6. Select the final model 

In [16]:
x=patients_data['hospital_los']
y=patients_data['hospital_expire_flag']

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sn

In [103]:
def showResults(test, pred, model_name):
    accuracy = accuracy_score(test, pred)
    precision= precision_score(test, pred, average='macro')
    recall = recall_score(test, pred, average = 'macro')
    f1score_macro = f1_score(test, pred, average='macro') 
    f1score_micro = f1_score(test, pred, average='micro') 
    print("Accuracy  : {}".format(accuracy))
    print("Precision : {}".format(precision))
    print("Recall : {}".format(recall))
    print("f1score macro : {}".format(f1score_macro))
    print("f1score micro : {}".format(f1score_micro))
    cm=confusion_matrix(test, pred, labels=[1,2,3,4,5,6,7,8])
    # return (model_name, round(accuracy,3), round(precision,3) , round(recall,3) , round(f1score_macro,3), 
    #         round(f1score_micro, 3), cm)

### Sampling methods to handle class imbalance

In [58]:
def undersample_majority(x_train, y_train):
    
    # Separate the positive and negative x/y data:
    pos_features = x_train[y_train == 1]
    neg_features = x_train[y_train == 0]
    pos_labels = y_train[y_train == 1]
    neg_labels = y_train[y_train == 0]
    
    # Count the number of negative data points, and select that many from the positive data points. This means that 
    # positive samples will be 'copied' or repeated and hence occur multiple times in the training data:
    ids = np.arange(len(neg_features))
    choices = np.random.choice(ids, len(pos_features))
    res_neg_features = neg_features[choices]
    res_neg_labels = neg_labels[choices]
    
    # Combine the resamples and negative data points, and shuffle:
    resampled_features = np.concatenate([res_neg_features, pos_features], axis=0)
    resampled_labels = np.concatenate([res_neg_labels, pos_labels], axis=0)
    order = np.arange(len(resampled_labels))
    np.random.shuffle(order)
    resampled_features = resampled_features[order]
    resampled_labels = resampled_labels[order]
    return resampled_features, resampled_labels

### Logistic Regression Base model

In [118]:
df_train, df_test = train_test_split(patients_data, 
                                     train_size = 0.8, 
                                     test_size = 0.2, 
                                     random_state = 100)

numeric_vars = ['hospital_los','admission_age','los_icu']

# indices_to_remove = []
# for i, row in df_train.iterrows():
#     if row['los'] < 48:
#         indices_to_remove.append(i)

# df_train = df_train.reset_index().set_index('icustay_id').drop(indices_to_remove, axis = 0)
# df_test = df_test.drop(indices_to_remove, axis = 0)

x_train = df_train[numeric_vars].to_numpy()
y_train = df_train['hospital_expire_flag']


#x_train, y_train = undersample_majority(x_train, y_train)

x_test = df_test[numeric_vars]
y_test = df_test['hospital_expire_flag']



logreg = LogisticRegression(penalty = 'l2', C = 1, random_state = 0, solver='lbfgs')
logreg.fit(x_train, y_train)

predction = logreg.predict(x_test)

showResults(y_test,predction,logreg)




Accuracy  : 0.9043738566439381
Precision : 0.7023888796404195
Recall : 0.502332858947503
f1score macro : 0.4800433713885687
f1score micro : 0.904373856643938


NameError: name 'y_true_logreg_combined' is not defined

### K-fold Logistic Regression

In [116]:
from sklearn.model_selection import KFold

numeric_vars = ['hospital_los','admission_age','los_icu']

X = patients_data[numeric_vars].to_numpy()

y = patients_data['hospital_expire_flag']


number_of_kfolds = 5
kf = KFold(n_splits=number_of_kfolds)
kf.get_n_splits(X)

predictionSum = 0
Score = 0

for train_index, test_index in kf.split(X):
  train_features, test_features = X[train_index], X[test_index]
  train_labels, test_labels = y[train_index], y[test_index]

  logreg = LogisticRegression(solver='lbfgs')
  logreg.fit(train_features, train_labels)
  
  Score += (logreg.score(test_features,test_labels))

print("score:", Score/number_of_kfolds)



score: 0.9025717897316957


### StratifiedKFold Logistic Regression