In [None]:
import sys
sys.path.append('../')

from model import *
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# MIMIC - Triage

This notebook runs the analysis on the MIMIC emergency data by leveraging experts' agreement:
1. Explore a model built on the data while ignoring experts
2. Compute agreement between experts using influence functions
3. Retrain the model on the set of labels for which experts strongly agree

The current analysis uses multilayer perceptrons with a single train / test split.

In [None]:
# Data file - choose the scenario of interest (relative path)
data_set = "../data/triage_scenario_1.csv"
# Selective labels: when True, only cases kept by the nurse (D == 1) are treated
# as having an observed outcome when the training masks are built further down
selective_labels = False

### Data

Reopen the data created with the notebook in `data/`

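To use your own data, change the following cell so that it opens a file with:
- `X` covariates
- `H` associated (human) experts (the `nurse` column in this file)
- `D` their decision for each case
- `Y` observed outcome (the `Y1` / `Y2` columns in this file; `Y1` is the one used for training below)
- `Yc` a concept (the `YC` column in this file)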

In [None]:
triage = pd.read_csv(data_set, index_col = [0, 1])

# Decision, outcomes and concept form the targets; `nurse` identifies the expert.
# Every remaining column is a covariate.
target_columns = ['D', 'Y1', 'Y2', 'YC']
covariates = triage.drop(columns = target_columns + ['nurse'])
target = triage[target_columns]
nurses = triage['nurse']

Split the data into 80% train and 20% test.

In [None]:
# Hold out 20% of the cases for testing; the seed is fixed so the split is repeatable
cov_train, cov_test, tar_train, tar_test, nur_train, nur_test = train_test_split(
    covariates,
    target,
    nurses,
    test_size = 0.2,
    random_state = 42,
)

### Modelling

In [None]:
# Architecture passed to the models trained below
# An empty layer list ([[]]) is equivalent to a simple logistic regression
params = dict(layers = [[50]])

In [None]:
def evaluate(predictions, p = 0.3):
    """
    Print the test-set performance of `predictions` against every target.

    For each target column of the notebook-level `tar_test` ('Y1', 'Y2', 'YC',
    'D') the AUC is printed. When the subgroup breakdown can be computed (it
    needs a `Group` column in the notebook-level `covariates`; `Group == 1` is
    reported as "Female"), two more metrics are printed per target, based on
    the fraction `p` of test points with the lowest predictions:
        - Female TNR: 1 - mean target value among the bottom-ranked female points
        - Female PNR: share of the female test points that are bottom-ranked

    Args:
        predictions: Scores for the test set, ordered like `tar_test`.
        p (float): Fraction of lowest-scored points to consider. Default 0.3.

    Returns:
        None. Results are printed.
    """
    print('Performance')

    # The bottom-ranked set and the subgroup mask do not depend on the target,
    # so they are computed once instead of once per target.
    try:
        ranked = pd.Series(predictions, index = tar_test.index)
        bot = ranked.nsmallest(n = int(p * len(ranked)), keep = 'all').index
        female = covariates.loc[ranked.index].Group == 1
        bot_female = bot.intersection(female[female].index)
        n_female = female.sum()
    except Exception as error:
        # Best effort: AUCs are still reported when the subgroup breakdown is
        # unavailable (e.g. no `Group` column), but the reason is shown rather
        # than silently dropped. Unlike a bare `except`, this does not swallow
        # KeyboardInterrupt / SystemExit.
        print('Subgroup metrics skipped: {!r}'.format(error))
        bot_female, n_female = None, None

    for tar in ['Y1', 'Y2', 'YC', 'D']:
        print('{} - AUC: {:.3f}'.format(tar, roc_auc_score(tar_test[tar], predictions)))

        if bot_female is None:
            continue

        print('{} - Female TNR: {:.3f}'.format(tar, 1 - tar_test[tar].loc[bot_female].mean()))
        print('{} - Female PNR: {:.3f}'.format(tar, len(bot_female) / n_female))
        print('\n')

##### 1. Train on decision

This model predicts the nurse's decision from the covariates.

In [None]:
# Fit the Platt-calibrated decision model, then score and evaluate the test set
f_D = BinaryMLP(**params).fit(
    cov_train,
    tar_train['D'],
    nur_train,
    platt_calibration = True,
)
predictions_d = f_D.predict(cov_test)
evaluate(predictions_d)

##### 2. Agreement computation 

Measures of agreeability are estimated in a cross-validation fashion on the training set.

In [None]:
# Cross-validated predictions and influences of the decision model on the train set
folds, predictions, influence = influence_cv(
    BinaryMLP,
    cov_train,
    tar_train['D'],
    nur_train,
    params = params,
    l1_penalties = [0.001, 0.01, 0.1, 1],
)

In [None]:
# Compute the agreeability metrics from the cross-validated influences and predictions
# NOTE(review): both metrics are later compared element-wise with thresholds and
# combined with per-point masks, so presumably one value per training point - confirm in `model`
center_metric, opposing_metric = compute_agreeability(influence, predictions)

In [None]:
# Analyze confident points
# `delta` is the confidence margin: predictions within `delta` of 0 or 1 count as
# confident, and it is also the tolerance used when comparing predictions with
# the observed decisions
delta = 0.05 # Control which point to consider from a confidence point of view

In [None]:
# A prediction is confident when it lies within `delta` of either 1 or 0
confident_positive = predictions > (1 - delta)
confident_negative = predictions < delta
high_conf = confident_positive | confident_negative

In [None]:
# Amalgamation parameters
pi_1 = 6 # Control criterion on centre mass metric (lower bound applied to `center_metric`)
pi_2 = 0.8 # Control criterion on opposing metric (lower bound applied to `opposing_metric`)
pi_3 = 0.002 # On flatness (a point is flat when no absolute influence exceeds this value)
tau = 1.0  # Balance between observed and expert labels (1.0 = rely on the expert decision only)

In [None]:
# Apply criteria on amalgamation
# Flat influence: no entry along axis 0 exceeds the flatness threshold in absolute value
exceeds_flatness = np.abs(influence) > pi_3
flat_influence = exceeds_flatness.sum(0) == 0

# High agreement: both agreeability criteria met (or flat influence), on confident points only
meets_metrics = (center_metric > pi_1) & (opposing_metric > pi_2)
high_agr = (meets_metrics | flat_influence) & high_conf

# Keep the high-agreement points whose prediction is within `delta` of the observed decision
decision_gap = (predictions - tar_train['D']).abs()
high_agr_correct = (decision_gap < delta) & high_agr

In [None]:
# Create amalgamated labels
# Start from the observed outcome for every point ...
tar_train['Ya'] = tar_train['Y1'].astype(int)

# ... then, on the `high_agr_correct` points, blend outcome and expert decision with weight `tau`
agreed_outcome = tar_train.loc[high_agr_correct, 'Y1']
agreed_decision = tar_train.loc[high_agr_correct, 'D']
tar_train.loc[high_agr_correct, 'Ya'] = (1 - tau) * agreed_outcome + tau * agreed_decision

In [None]:
# Training mask for the amalgamated model
if selective_labels:
    # Outcome observed (D == 1) or point selected through high agreement
    index_amalg = (tar_train['D'] == 1) | high_agr_correct
else:
    index_amalg = tar_train['D'].isin([0, 1])

share_used = 100 * index_amalg.mean()
print("Use: {:.2f} % of data".format(share_used))

##### 3. Updated model

In [None]:
# Train on the amalgamated labels, restricted to the selected training points
f_A = BinaryMLP(**params).fit(
    cov_train[index_amalg],
    tar_train.loc[index_amalg, 'Ya'],
    nur_train[index_amalg],
)
predictions_amal = f_A.predict(cov_test)
evaluate(predictions_amal)

##### 4. Train on observed data

In [None]:
# Training mask for the outcome model: with selective labels only D == 1 is kept
if selective_labels:
    index_observed = tar_train['D'] == 1
else:
    index_observed = tar_train['D'].isin([0, 1])

In [None]:
# Outcome model: fit on `Y1` for the observed training points only
f_Y = BinaryMLP(**params).fit(
    cov_train[index_observed],
    tar_train.loc[index_observed, 'Y1'],
    nur_train[index_observed],
)
predictions_y = f_Y.predict(cov_test)
evaluate(predictions_y)

--------

##### 5. Hybrid alternative

- Leverage human model in the amalgamation set
- Leverage outcome model on non amalgamation set

Models need to be retrained on their respective subsets and calibrated so that their predictions can be mixed.

In [None]:
# Start from the decision model's test predictions; entries outside the
# high-agreement set are overwritten with the outcome model's predictions further down.
# Note: this rebinds `predictions`, which until now held the cross-validated train predictions.
predictions = predictions_d.copy()

In [None]:
# Compute which test points are part of A for test set
# Every test-set quantity gets its own `_test` name so the train-set
# `center_metric` / `opposing_metric` are not overwritten; re-running the
# train-set amalgamation cell after this one therefore stays valid.
predictions_test, influence_test = influence_estimate(
    BinaryMLP,
    cov_train,
    tar_train['D'],
    nur_train,
    cov_test,
    params = params,
    l1_penalties = [0.001, 0.01, 0.1, 1],
)
center_metric_test, opposing_metric_test = compute_agreeability(influence_test, predictions_test)

# Same criteria as on the train set, applied to the test-set estimates
flat_influence_test = (np.abs(influence_test) > pi_3).sum(0) == 0
high_conf_test = (predictions_test > (1 - delta)) | (predictions_test < delta)
high_agr_test = (((center_metric_test > pi_1) & (opposing_metric_test > pi_2)) | flat_influence_test) & high_conf_test
high_agr_correct_test = ((predictions_test - tar_test['D']).abs() < delta) & high_agr_test

In [None]:
# Calibrated outcome model, fitted on the training points selected by `index_observed`
# NOTE(review): the section text mentions retraining on the non-amalgamation
# subset, while this fits on `index_observed` - confirm which one is intended
f_hyb = BinaryMLP(**params).fit(
    cov_train[index_observed],
    tar_train.loc[index_observed, 'Y1'],
    nur_train[index_observed],
    platt_calibration = True,
)

# Outside the high-agreement set, rely on the outcome model instead of the decision model
outside_agreement = ~high_agr_correct_test
predictions[outside_agreement] = f_hyb.predict(cov_test.loc[outside_agreement])
evaluate(predictions)

##### 6. Alternative consensus - Ensemble

Instead of the influence-based approach, approximate the consensus with an ensemble: each model is trained on one expert, then consistency is estimated by averaging the decisions made across experts.

In [None]:
# Ensemble-based agreement estimated on the train set
# NOTE(review): `decisions` is thresholded and averaged over axis 0 in the next
# cell, so it is presumably laid out as (experts x points) - confirm against `ensemble_agreement_cv`
decisions = ensemble_agreement_cv(BinaryMLP, cov_train, tar_train['D'], nur_train, params = params)

In [None]:
# Binarize each decision at 0.5, then average the binarized decisions along axis 0
binarized_decisions = decisions > 0.5
predictions = binarized_decisions.mean(0)

# Confident when the averaged decision is within `delta` of 1 or of 0
near_one = predictions > (1 - delta)
near_zero = predictions < delta
high_conf = near_one | near_zero

# Keep confident points whose averaged decision is within `delta` of the observed decision
decision_gap = (predictions - tar_train['D']).abs()
high_agr_correct = (decision_gap < delta) & high_conf

In [None]:
# From the different experts' decisions, build the amalgamated labels
# Start from the observed outcome for every point ...
tar_train['Ya_ens'] = tar_train['Y1'].astype(int)

# ... then, on the `high_agr_correct` points, blend outcome and expert decision with weight `tau`
agreed_outcome = tar_train.loc[high_agr_correct, 'Y1']
agreed_decision = tar_train.loc[high_agr_correct, 'D']
tar_train.loc[high_agr_correct, 'Ya_ens'] = (1 - tau) * agreed_outcome + tau * agreed_decision

# Training mask for the ensemble-based amalgamated model
if selective_labels:
    index_amalg = (tar_train['D'] == 1) | high_agr_correct
else:
    index_amalg = tar_train['D'].isin([0, 1])

In [None]:
# Train on the ensemble-based amalgamated labels, restricted to the selected points
f_A_ens = BinaryMLP(**params).fit(
    cov_train[index_amalg],
    tar_train.loc[index_amalg, 'Ya_ens'],
    nur_train[index_amalg],
)
predictions_amal_ens = f_A_ens.predict(cov_test)
evaluate(predictions_amal_ens)

----------

##### 7. Defer approach

Jointly learn when to defer to human and when to use the model prediction

In [None]:
from model.defer import DeferMLP

In [None]:
# Defer model: fitted on the observed points with the outcome `Y1` and the human decision `D`;
# the human decision is also passed at prediction time
f_defer = DeferMLP(**params).fit(
    cov_train[index_observed],
    tar_train.loc[index_observed, 'Y1'],
    tar_train.loc[index_observed, 'D'],
)
predictions_defer = f_defer.predict(cov_test, tar_test['D'])
evaluate(predictions_defer)

##### 8. Ensemble 

Train a separate model for $Y$ and $D$ and average their predictions.

In [None]:
# Uniform average of the decision-model and outcome-model test predictions
predictions_avg = (predictions_d + predictions_y) / 2
evaluate(predictions_avg)

##### 9. Weak supervision

Average the labels $Y$ and $D$ (0.5 if they disagree) and train a model on these labels.

In [None]:
# Weak labels: average of decision and outcome (0.5 when they disagree).
# When the outcome is missing, fall back to the observed decision itself. The
# division must happen BEFORE `fillna`, otherwise the fallback is halved
# (a decision of 1 would become 0.5).
weak_labels = ((tar_train['D'] + tar_train['Y1']) / 2).fillna(tar_train['D'])

In [None]:
# Train on the weak labels using the full training set
f_weak = BinaryMLP(**params).fit(cov_train, weak_labels, nur_train)
predictions_weak = f_weak.predict(cov_test)
evaluate(predictions_weak)

##### 10. Noisy labels learning

Use confident learning to discard noisy labels.

In [None]:
import cleanlab
from sklearn.neural_network import MLPClassifier

In [None]:
# Confident learning: flag the training points whose decision label looks noisy
base_classifier = MLPClassifier(50)
f_robust = cleanlab.classification.CleanLearning(base_classifier)
decision_labels = tar_train['D'].astype(int)
label_issues = f_robust.find_label_issues(cov_train, decision_labels)

# Boolean mask that keeps only the points without a flagged label issue
flagged = label_issues.is_label_issue.values
selection = ~flagged

In [None]:
# For a fair comparison, train the same model as the other methods
f_robust = BinaryMLP(**params)
# Filter covariates, decisions AND experts with the same mask so that the three
# inputs stay aligned (the expert series was previously passed unfiltered).
# The result of `fit` is reassigned, as for every other model in this notebook.
f_robust = f_robust.fit(
    cov_train.iloc[selection],
    tar_train['D'].iloc[selection],
    nur_train.iloc[selection],
)
predictions_robust = f_robust.predict(cov_test)
evaluate(predictions_robust)