# Triage MIMIC - Emergency Department

This analysis relies on the emergency department data from the MIMIC-IV dataset (refer to https://physionet.org/content/mimic-iv-ed/1.0/ for the original dataset).

First, download the data from the PhysioNet website, following the instructions given there.

```
wget -r -N -c -np --user USERNAME --ask-password https://physionet.org/files/mimic-iv-ed/1.0/  
wget -r -N -c -np --user USERNAME --ask-password https://physionet.org/files/mimiciv/1.0/core/
```

This creates a `physionet.org` folder: its `files/mimic-iv-ed/1.0/ed` directory contains the emergency department tables, and `files/mimiciv/1.0/core` contains the patient table.

In [None]:
# Root folder of the downloaded files; every dataset path below is built relative to it
path = 'physionet.org/files/'

##### Extract data of interest

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import os

In [None]:
# Open data: patient table, triage measurements and emergency department stays
# NOTE(review): index columns are selected by position - presumably subject_id for
# the patient table and (subject_id, stay_id) for the two ED tables; confirm against the files.
patients_file = os.path.join(path, 'mimiciv/1.0/core/patients.csv.gz')
triage_file = os.path.join(path, 'mimic-iv-ed/1.0/ed/triage.csv.gz')
stays_file = os.path.join(path, 'mimic-iv-ed/1.0/ed/edstays.csv.gz')

demo = pd.read_csv(patients_file, index_col=0)
triage = pd.read_csv(triage_file, index_col=[0, 1])
ed = pd.read_csv(
    stays_file,
    index_col=[0, 2],
    parse_dates=['intime', 'outtime'],  # parsed as timestamps instead of strings
)

In [None]:
# Remove unnecessary columns and datapoints with any missing data
triage = triage.drop(columns='chiefcomplaint')
# `axis` and `how` are passed by keyword: the positional form `dropna(0, 'any')`
# is rejected by pandas >= 2.0, where these arguments are keyword-only.
triage = triage.dropna(axis=0, how='any')
triage

In [None]:
# Nurse assignment
# Expertise and tiredness might play a role in real triage decisions; here each stay is
# simply assigned uniformly at random to one of 20 synthetic nurses (ids 0 to 19),
# independently of any patient or admission information.
# The generator is seeded (same seed as the scenario cells) so that the assignment,
# and every file derived from it, is reproducible across runs.
np.random.seed(42)
triage['nurse'] = np.random.choice(np.arange(20), size=len(triage))

In [None]:
# Outcome - Y1
# Defined as admission to the hospital: Y1 is True when the stay carries a hospital
# admission identifier. The previous `isna()` test produced the opposite label
# (True for stays WITHOUT an identifier).
# NOTE(review): presumes `hadm_id` is missing exactly when the stay did not lead to an
# admission - confirm against the dataset documentation.
triage['Y1'] = ed.hadm_id.notna().loc[triage.index]

In [None]:
# Outcome - Y2
# Defined from age and pain: True when the patient's anchor age is above 65
# or when the reported pain is 7 or more
patient_age = triage.join(demo)['anchor_age']
is_older = patient_age > 65
has_high_pain = triage['pain'] >= 7
triage['Y2'] = is_older | has_high_pain

In [None]:
# Concept - Yc
# Yc is defined as the union of Y1 and Y2: it holds as soon as either outcome holds
triage['YC'] = triage[['Y1', 'Y2']].any(axis=1)

In [None]:
# Normalize data
# Every column except the five right-most ones is standardised in place
# NOTE(review): given the columns added above, the five untouched columns are presumably
# acuity, nurse, Y1, Y2 and YC - confirm the column order of the triage table.
measurements = triage.iloc[:, :-5]
scaler = StandardScaler().fit(measurements)
triage.iloc[:, :-5] = scaler.transform(measurements)
triage

### Verification

We check what proportion of the population has each of these characteristics.

In [None]:
# Nurse assignment
triage['nurse'].value_counts().sort_index() / len(triage)

In [None]:
# Outcome - Y1
triage['Y1'].mean()

In [None]:
# Outcome - Y2
triage['Y2'].mean()

In [None]:
# Concept - Yc
triage['YC'].mean()

In [None]:
# Intersection Y1 and Y2
(triage['Y1'] & triage['Y2']).sum() / min(triage['Y1'].sum(), triage['Y2'].sum())

In [None]:
# Intersection Y1 concept
(triage['Y1'] & triage['YC']).sum() / triage['YC'].sum()

In [None]:
# Intersection Y2 concept
(triage['Y2'] & triage['YC']).sum() / triage['YC'].sum()

------------

# Semi - synthetic labels for scenarios

We create semi-synthetic labels using tree-based models to allow more control over the consistency scenarios.

In [None]:
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np

#### Scenario 1 : Random errors


1. Build a tree to predict Y1
2. Build a tree to predict Y2
3. Update synthetic labels (Y1, Y2 and YC) to be the one predicted by trees
4. Create a tree to predict YC (aim for high auc)
5. Analyze each leaf and keep the leaves with high precision for Y1 (> 70 %) and low intersection with Y2 (< 30 %)
6. Randomly draw a label for 100 % of the patients in these leaves
7. Update D to be the updated labels

In [None]:
# Scenario table: the triage data without the 'acuity' and 'pain' columns
# (`drop` already returns a new frame, so `triage` itself is left untouched)
triages1 = triage.drop(columns=['acuity', 'pain'])

# Model inputs: what remains once the nurse id and the three labels are removed
non_feature_columns = ['nurse', 'Y1', 'Y2', 'YC']
covariates = triages1.drop(columns=non_feature_columns)

In [None]:
# 1 - Model for Y1
# Depth-limited tree fitted and evaluated on the same data
model_y1 = DecisionTreeClassifier(max_depth=15, random_state=42).fit(covariates, triages1['Y1'])
# Second column of predict_proba, i.e. the score of the second class in model_y1.classes_
proba_y1 = model_y1.predict_proba(covariates)
synth_y1 = proba_y1[:, 1]
# In-sample AUC of the tree against the original Y1
roc_auc_score(triages1['Y1'], synth_y1)

In [None]:
# 2 - Model for Y2
# Same tree configuration as for Y1, fitted on the Y2 labels
target_y2 = triages1['Y2']
model_y2 = DecisionTreeClassifier(random_state=42, max_depth=15)
model_y2.fit(covariates, target_y2)
# Keep only the second predict_proba column (score of the second class in model_y2.classes_)
synth_y2 = model_y2.predict_proba(covariates)[:, 1]
# In-sample AUC of the tree against the original Y2
roc_auc_score(target_y2, synth_y2)

In [None]:
# 3 - Update labels
# The original outcomes are replaced by the thresholded tree predictions
for label, scores in (('Y1', synth_y1), ('Y2', synth_y2)):
    triages1[label] = scores > 0.5
# The concept is recomputed from the synthetic outcomes
triages1['YC'] = triages1[['Y1', 'Y2']].any(axis=1)

In [None]:
# 4 - Model for D: fit a tree on Yc; the decisions of some of its leaves are changed afterwards
model_yc = DecisionTreeClassifier(max_depth=10, random_state=42).fit(covariates, triages1['YC'])
# Second column of predict_proba, i.e. the score of the second class in model_yc.classes_
proba_yc = model_yc.predict_proba(covariates)
synth_yc = proba_yc[:, 1]
# In-sample AUC of the tree against the synthetic Yc
roc_auc_score(triages1['YC'], synth_yc)

In [None]:
# 5 - Analyse leaves
# Leaf reached by each patient in the Yc tree
final_leave_yc = model_yc.apply(covariates)
print('Tree contains {} leaves'.format(len(np.unique(final_leave_yc))))

## Select leaves
# Per leaf: share of patients with Y1, and share of patients with both Y1 and Y2
share_y1 = triages1['Y1'].groupby(final_leave_yc).mean()
share_y1_and_y2 = (triages1['Y1'] & triages1['Y2']).groupby(final_leave_yc).mean()
# Keep the leaves dominated by Y1 (> 70 %) that rarely combine it with Y2 (< 30 %)
is_selected = (share_y1 > 0.7) & (share_y1_and_y2 < 0.3)
leaves_to_update = is_selected[is_selected].index

In [None]:
# 6 - Randomly draw predictions
print("{} leaves selected covering: {:.2f} % of the population".format(len(leaves_to_update), 100*pd.Series(final_leave_yc).isin(leaves_to_update).mean()))
synth_yc_sc1 = synth_yc.copy()

# Seeded (same seed as the other scenarios) so that the exported labels are
# reproducible from one run to the next
np.random.seed(42)

# For 100 % draw a random label
# `noise` marks the patients whose label is redrawn; with a fraction of 1 it is True
# everywhere (uniform draws lie in [0, 1)), lower values would randomise only a subset.
randomized_fraction = 1.
noise = np.random.uniform(size = len(final_leave_yc)) < randomized_fraction

# All selected leaves are handled in a single draw: a patient receives a random
# 0/1 label when its leaf is selected and it is marked by `noise`
selection = np.isin(final_leave_yc, leaves_to_update) & noise
synth_yc_sc1[selection] = np.random.choice([0, 1], size = np.sum(selection))

In [None]:
# 7 - Update D
# The decision is the thresholded (partly randomised) score
decisions_sc1 = synth_yc_sc1 > 0.5
triages1['D'] = decisions_sc1
# Export the scenario next to the notebook
triages1.to_csv('triage_scenario_1.csv')

#### Scenario 2: Incorrect and homogeneous beliefs

Instead of step 6, the whole population is subject to the same 75 % bias: in the selected leaves, 75 % of the patients (drawn at random) are predicted not(Y1).

In [None]:
triages2 = triages1.copy()

In [None]:
# 6ter - Bias 50 %
synth_yc_sc2 = synth_yc.copy()

## Selection of 50%
np.random.seed(42)
biased = np.random.uniform(size = len(triages2)) > .25

# Reverse leaves
selection = biased & pd.Series(final_leave_yc, index = triages2.index).isin(leaves_to_update)
synth_yc_sc2[selection] = ~triages2.Y1[selection]

In [None]:
# 7ter - Update D
triages2['D'] = synth_yc_sc2 > 0.5
triages2.to_csv('triage_scenario_2.csv')

#### Scenario 3: Incorrect and heterogeneous beliefs

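Instead of step 6, each nurse has a different level of bias $X_{nurse}$, drawn uniformly between a lower bound $l$ and $l + 30\%$ (one file is generated for each $l \in \{30\%, 50\%, 70\%\}$, the last one giving biases between 70 % and 100 %), meaning that the nurse predicts not(Y1) in these leaves for $X_{nurse}$ % of the patients.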

In [None]:
triages3 = triages1.copy()

In [None]:
# 6bis - Draw different rate for each nurse and update accordingly
for lower in [0.3, 0.5, 0.7]:
    synth_yc_sc3 = synth_yc.copy()

    ## Create nurse-specific noise
    np.random.seed(42)
    proba_error = lower + 0.3 * np.random.uniform(size = len(np.unique(triages3.nurse)))
    noises = {nurse: np.random.uniform(size = len(triages3)) > proba_error[nurse] for nurse in np.unique(triages3.nurse)}

    # Draw random label
    selection = pd.Series(final_leave_yc, index = triages3.index).isin(leaves_to_update)
    for nurse in noises:
        selection_nurse = selection & noises[nurse] & (triages3.nurse == nurse)
        synth_yc_sc3[selection_nurse] = ~triages3.Y1[selection_nurse]

    # 7bis - Update D
    triages3['D'] = synth_yc_sc3 > 0.5
    triages3.to_csv('triage_scenario_3_{}.csv'.format(lower))

#### Scenario 4: One nurse biased against one group

Instead of step 6, one nurse (nurse 0) is biased against female patients.

1. Create group
2. Bias the decisions of the given nurse by underestimating the risk of all patients in the group seen by that nurse

In [None]:
triages4 = triages1.copy()
triages4['D'] = synth_yc

In [None]:
# 1 - Create group
triages4['Group'] = (triages4.join(demo.gender).gender == 'F').astype(int)

In [None]:
# 2 - Bias nurse 0
selection_nurse = (triages4.Group == 1) & (triages4.nurse == 0)
triages4['D'][selection_nurse] = False

In [None]:
triages4.to_csv('triage_scenario_4.csv')

#### Scenario 4': One nurse biased against one group

Same as before, but with a non-random assignment (95 % of the female patients are assigned to this nurse).

In [None]:
triages4bis = triages4.copy()
triages4bis['D'] = synth_yc

In [None]:
# 2 - Bias nurse 0
np.random.seed(42)
selection_nurse = (triages4bis.Group == 1)
triages4bis.loc[selection_nurse[selection_nurse].sample(frac = 0.95, replace = False).index, 'nurse'] = 0

selection_nurse = selection_nurse & (triages4bis.nurse == 0)
triages4bis['D'][selection_nurse] = False

In [None]:
triages4bis.to_csv('triage_scenario_4bis.csv')

#### Scenario 5: Half of the nurses biased against one group

Same as before, but half of the nurses (nurses 0 to 9) are biased.

In [None]:
triages5 = triages4.copy()
triages5['D'] = synth_yc

In [None]:
# 2bis - Bias half nurses
selection_nurse = (triages5.Group == 1) & (triages5.nurse < 10)
triages5['D'][selection_nurse] = False

In [None]:
triages5.to_csv('triage_scenario_5.csv')

#### Scenario 6: All nurses biased against one group

Same as before, but all nurses are biased.

In [None]:
triages6 = triages5.copy()
triages6['D'] = synth_yc

In [None]:
# 2bis - Bias all nurses
triages6['D'][triages6.Group == 1] = False

In [None]:
triages6.to_csv('triage_scenario_6.csv')

#### Scenario 7: All nurses 80% - biased against one group

Same as before, but all nurses are biased for only 80 % of the patients in the group.

In [None]:
triages7 = triages6.copy()
triages7['D'] = synth_yc

In [None]:
# 2bis - Bias all nurses
group = triages6[triages6.Group == 1].sample(frac = 0.8, replace = False).index
triages7['D'].loc[group] = False

In [None]:
triages7.to_csv('triage_scenario_7.csv')