# Triage MIMIC - Emergency Department

This analysis relies on the emergency data from the MIMIC IV dataset (Refer to https://physionet.org/content/mimic-iv-ed/1.0/ for the original dataset.) 

First, you need to download the data from Physionet website, following the instructions on the website.

`
wget -r -N -c -np --user USERNAME --ask-password https://physionet.org/files/mimic-iv-ed/1.0/
`

This will result in a `physionet.org` folder in which the `ed` directory will contains all relevant data.

In [1]:
path = 'physionet.org/files/mimic-iv-ed/1.0/ed/'

##### Extract data of interest

In [2]:
import pandas as pd
import os

In [3]:
# Open data
triage = pd.read_csv(os.path.join(path, 'triage.csv.gz'), index_col = [0, 1])
ed = pd.read_csv(os.path.join(path, 'edstays.csv.gz'), index_col = [0, 2], parse_dates = ['intime', 'outtime'])

In [4]:
# Remove unnecessary columns and datapoints with any missing data
triage = triage.drop(columns = 'chiefcomplaint')
triage = triage.dropna(0, 'any')
triage

  triage = triage.dropna(0, 'any')


Unnamed: 0_level_0,Unnamed: 1_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity
subject_id,stay_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15585360,37573921,97.0,87.0,18.0,100.0,150.0,71.0,10.0,3.0
15248757,32172727,97.1,112.0,20.0,100.0,147.0,97.0,8.0,4.0
16648037,38946064,98.5,59.0,18.0,99.0,160.0,86.0,2.0,2.0
13492931,39828574,100.6,90.0,16.0,96.0,107.0,55.0,0.0,3.0
11475777,38193311,97.1,85.0,16.0,100.0,138.0,86.0,7.0,3.0
...,...,...,...,...,...,...,...,...,...
15913671,35574167,98.0,82.0,15.0,98.0,127.0,86.0,8.0,3.0
14913519,33280070,97.1,104.0,18.0,97.0,90.0,57.0,0.0,2.0
13537748,39146222,97.1,56.0,20.0,100.0,177.0,92.0,6.0,2.0
15608541,39109339,97.6,92.0,18.0,98.0,197.0,73.0,0.0,4.0


In [5]:
# Nurse assignment
# Expertise and tiredness might play a role here and we assign the day of admission as proxies of these dimensions
triage['nurse'] = ed.intime.dt.day_of_week[triage.index]

In [6]:
# Acuity binarization - D
# Human decision
triage['D'] = triage['acuity'] <= 2

In [7]:
# Outcome - Y1
# Defined as admission to the hospital
triage['Y1'] = ed.hadm_id.isna()[triage.index]

In [8]:
# Outcome - Y2
# Defined as abnormal vital signs using Emergency Severity Index
triage['Y2'] = (triage.o2sat < 92) | (triage.resprate > 20) | (triage.heartrate > 100)

In [9]:
# Concept - Yc
# Yc is definied as the union of Y1 and Y2
triage['YC'] = triage['Y1'] | triage['Y2']

In [10]:
triage.to_csv('triage_clean.csv')

### Verification

We study what proportion of the population have these characteristics.

In [11]:
# Nurse assignment
triage['nurse'].value_counts().sort_index() / len(triage)

0    0.143822
1    0.142558
2    0.142089
3    0.143345
4    0.142443
5    0.142326
6    0.143418
Name: nurse, dtype: float64

In [12]:
# Human decision D - Acuity
triage['D'].mean()

0.36397630728730407

In [13]:
# Outcome - Y1
triage['Y1'].mean()

0.5445559610705596

In [14]:
# Outcome - Y2
triage['Y2'].mean()

0.20116369510589924

In [15]:
# Concept - Yc
(triage['Y1'] & triage['Y2']).sum() / triage['Y2'].sum()

0.4336381887129155

----------

# Semi - synthetic labels for scenarios

We create semi synthetic labels using tree-based models to allow more control on the consistency scenarios

In [16]:
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [17]:
# Model for Y1
model_y1 = DecisionTreeClassifier(max_depth = 9, random_state = 42)
model_y1.fit(triage.iloc[:, :7], triage['Y1'])
synth_y1 = model_y1.predict_proba(triage.iloc[:, :7])[:, 1]
roc_auc_score(triage['Y1'], synth_y1)

0.6871421377988849

In [18]:
# Model for Y2
model_y2 = DecisionTreeClassifier(max_depth = 2, random_state = 42)
model_y2.fit(triage.iloc[:, :7], triage['Y2'])
synth_y2 = model_y2.predict_proba(triage.iloc[:, :7])[:, 1]
roc_auc_score(triage['Y2'], synth_y2)

0.9928681190670929

In [19]:
# Update labels
triage['Y1'] = synth_y1 > 0.5
triage['Y2'] = synth_y2 > 0.5

In [20]:
# Model for D : Use a model for Yc and chance some of the leaved decision with random noise
model_yc = DecisionTreeClassifier(max_depth = 4, random_state = 42)
model_yc .fit(triage.iloc[:, :7], triage['Y1'] | triage['Y2'])
synth_yc = model_yc.predict_proba(triage.iloc[:, :7])[:, 1]
roc_auc_score(triage['Y1'] | triage['Y2'], synth_yc)

0.9659704778145679

In [21]:
# Compute last leaves of each point
final_leave_yc = model_yc.apply(triage.iloc[:, :7])

# Compute precision in Y2 for each leave
for leaf in np.unique(final_leave_yc):
    selection = final_leave_yc == leaf
    print('{} -> {:.2f} precision - {} patients'.format(leaf, 
            precision_score(triage['Y2'][selection], synth_yc[selection] > 0.5), selection.sum()))

4 -> 0.00 precision - 50679 patients
5 -> 1.00 precision - 2756 patients
6 -> 1.00 precision - 12646 patients
9 -> 0.00 precision - 26867 patients
10 -> 1.00 precision - 7321 patients
12 -> 0.00 precision - 4127 patients
13 -> 0.21 precision - 60272 patients
17 -> 0.00 precision - 10747 patients
18 -> 1.00 precision - 1196 patients
20 -> 0.13 precision - 17501 patients
21 -> 0.00 precision - 949 patients
24 -> 0.15 precision - 177069 patients
25 -> 0.43 precision - 25450 patients
27 -> 0.00 precision - 2924 patients
28 -> 1.00 precision - 632 patients


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Change prediction with noise for leaves with high precision
leaves_to_update = [10, 18] # Keep expert consistency in 5, 6, 28


eps = 1 # Noise to add
for leaf in leaves_to_update:
    selection = final_leave_yc == leaf
    noise = (np.random.random(np.sum(selection)) - 0.5) * 2 * eps
    synth_yc[selection] = np.minimum(np.maximum(synth_yc[selection] + noise, 0), 1)
    print(leaf, np.mean(synth_yc[selection] > 0.5))

10 0.7553612894413332
18 0.7474916387959866


In [23]:
triage['D'] = synth_yc

In [24]:
triage.to_csv('triage_semi_synthetic.csv')