# Predicting emergency department visits anchored on clinic dates
---
## Background
Previously, we built a model to predict emergency department (ED) visits anchored on treatment dates.

The problem with that approach is that primary physicians do not interact with their patients during treatment sessions; they only meet during clinic visits. A clinic visit is therefore the best time for the model to nudge the physician towards an intervention. Thus, we now want to build a model that predicts a patient's risk of an ED visit prior to the clinic date instead of prior to the treatment session.

---

In [1]:
%%capture
%cd ../../
%load_ext autoreload
%autoreload 2

In [2]:
import logging

import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime

from ml_common.util import get_excluded_numbers, load_pickle, save_pickle

from preduce.acu.eval import evaluate_valid, evaluate_test, predict
from preduce.acu.pipeline import PrepACUData
from preduce.acu.train import train_models, tune_params
from preduce.prepare.prep import anchor_features_to_clinic_dates
from preduce.summarize import feature_summary, get_label_distribution
from preduce.util import load_clinic_dates

pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 100)

logging.basicConfig(
    level=logging.INFO, 
    format='%(levelname)s:%(message)s', 
)

## Load clinic data

In [200]:
clinic = load_clinic_dates(data_dir='./data/processed')
treatment = pd.read_parquet('./data/interim/treatment.parquet.gzip')

Removing 123 visits that "occured before" 2006-01-05 00:00:00


In [201]:
# Filter the clinic dates
# Pair every clinic visit with all treatment sessions of the same patient, then narrow
# the pairs down to a single upcoming ("next") session per clinic visit.
cols = ['treatment_date', 'regimen', 'line_of_therapy', 'intent', 'cycle_number']
df = (
    clinic
    .merge(treatment[['mrn', *cols]], on='mrn', how='inner')
    .rename(columns={col: f'next_{col}' for col in cols})
    # filter out clinic dates where the next treatment session does not occur within 5 days
    .loc[lambda d: d['next_treatment_date'].between(d['clinic_date'], d['clinic_date'] + pd.Timedelta(days=5))]
    # filter out clinic dates where notes were uploaded after the next treatment session
    .loc[lambda d: d['upload_date'] < d['next_treatment_date']]
    # remove duplicates from the merging: keep the earliest qualifying session per clinic visit
    .sort_values(by=['mrn', 'next_treatment_date'])
    .drop_duplicates(subset=['mrn', 'clinic_date'], keep='first')
)

In [202]:
# Calculate the median height, weight, and body surface area prior to clinic date
# TODO: put this in make-clinical-dataset (combine.py) or in ml-common (anchor.py or prep.py)
cols = ['height', 'weight', 'body_surface_area']
prior_median = (
    df[['mrn', 'clinic_date']]
    .merge(treatment[['mrn', 'treatment_date', *cols]], on='mrn', how='inner')
    .rename(columns={col: f'prior_median_{col}' for col in cols})
    # only measurements taken at treatment sessions strictly before the clinic visit count
    .loc[lambda d: d['treatment_date'] < d['clinic_date']]
    .drop(columns=['treatment_date'])
    .groupby(['mrn', 'clinic_date'])
    .median()
    .reset_index()
)
# sns.displot(data=prior_median, x='prior_median_height', y='prior_median_weight')
# left join so that clinic visits without any prior measurement are kept (with missing medians)
df = df.merge(prior_median, on=['mrn', 'clinic_date'], how='left')

In [203]:
df = df.sort_values(by=['mrn', 'clinic_date'])
df.to_csv('./data/processed/assessment_dates.csv', index=False)

## Anchor features to clinic visits

In [204]:
anchor_features_to_clinic_dates(script_path='../make-clinical-dataset/scripts')

## Load feature data

In [56]:
df = pd.read_parquet('./data/processed/clinic_centered_feature_dataset.parquet.gzip')
df['assessment_date'] = df['clinic_date']
emerg = pd.read_parquet('./data/interim/emergency_room_visit.parquet.gzip')

## Prepare Data

In [57]:
# Patient visit flow = first clinic visit (book treatment plan) -> second clinic visit (check up) -> start treatment
# For the clinic visits right before starting a new treatment, we are missing treatment information.
# We can pull the treatment information back (backfill) from the next_* columns because it's
# pre-booked at that point (where treatment starts within 5 days after the clinic visit).
# NOTE: df.pop removes each next_* column from df, so this cell cannot be re-run
# without first reloading df.
no_trts_prior = df['treatment_date'].isnull()
for col in ['regimen', 'line_of_therapy', 'intent', 'cycle_number']:
    df.loc[no_trts_prior, col] = df.pop(f'next_{col}')
# presumably a visit with no prior treatment counts as day 0 of the treatment - TODO confirm
df.loc[no_trts_prior, 'days_since_starting_treatment'] = 0

In [58]:
# Run the project's ACU preprocessing on the feature data together with the ED visit records
prep = PrepACUData()
df = prep.preprocess(df, emerg)

# For missing height, weight, and body surface area, let's take the median value prior to assessment date
# (the prior_median_* columns).
# NOTE: df.pop removes each prior_median_* column from df, so this loop only works once per loaded df.
for col in ['height', 'weight', 'body_surface_area']:
    df[col] = df[col].fillna(df.pop(f'prior_median_{col}'))

# from ml_common.util import get_nmissing
# no_trts_prior = df['treatment_date'].isnull()
# pd.concat([get_nmissing(df.loc[no_trts_prior]), get_nmissing(df.loc[~no_trts_prior])], axis=1, keys=['First Visit', 'Subsequent Visits'])

INFO:Removing 0 patients and 5493 sessions not first of a given week
INFO:Removing 2982 patients and 12897 sessions before 2012-01-01 and after 2019-12-31
INFO:Removing the following features for drugs given less than 10 times: ['%_ideal_dose_given_DURVALUMAB', '%_ideal_dose_given_IPILIMUMAB', '%_ideal_dose_given_CAPECITABINE', '%_ideal_dose_given_ERLOTINIB']
INFO:Removing 2585 patients and 10848 sessions not from GI department
INFO:Dropping the following 56 features for missingness over 80%: ['creatinine', 'sodium', 'chloride', 'potassium', 'magnesium', 'red_cell_distribution_width', 'aspartate_aminotransferase', 'alanine_aminotransferase', 'alkaline_phosphatase', 'total_bilirubin', 'glucose', 'phosphate', 'lactate_dehydrogenase', 'albumin', 'eosinophil', 'bicarbonate', 'hemoglobin_change', 'mean_corpuscular_volume_change', 'mean_corpuscular_hemoglobin_concentration_change', 'mean_corpuscular_hemoglobin_change', 'mean_platelet_volume_change', 'lymphocyte_change', 'hematocrit_change', 

In [59]:
# Build the model inputs (X), labels (Y), and per-row meta information for the ED visit outcome
X, Y, metainfo = prep.prepare(df, event_name='ED_visit')
# restrict the original data to the rows that are present in X
df = df.loc[X.index]
# clean up Y: move the CEDIS complaint and CTAS score columns out of Y into metainfo
# (Y.pop mutates Y in place), then remove 'target_' from the remaining column names
for col in ['target_CEDIS_complaint', 'target_CTAS_score']:
    metainfo[col] = Y.pop(col)
Y.columns = Y.columns.str.replace('target_', '')

INFO:Removing 0 patients and 549 sessions that occured after 2018-02-01 in the development cohort
INFO:Removing 4 patients and 59 sessions in which patient had a target event in less than 2 days.
INFO:One-hot encoding training data
INFO:Reassigning the following 15 indicators with less than 6 patients as other: ['regimen_GI-CISPFU + TRAS(LOAD)', 'regimen_GI-CISPFU ANAL', 'regimen_GI-DOCEQ3W', 'regimen_GI-DOXO', 'regimen_GI-ELF', 'regimen_GI-EOX', 'regimen_GI-FOLFNALIRI', 'regimen_GI-FOLFNALIRI (COMP)', 'regimen_GI-FU/FA/CISP BILIARY', 'regimen_GI-FUFA C3 (GASTRIC)', 'regimen_GI-FUFA-5 DAYS', 'regimen_GI-GEM D1,8 + CAPECIT', 'regimen_GI-GEMFU (BILIARY)', 'regimen_GI-IRINO 4-WEEKLY', 'regimen_GI-PACLI WEEKLY']
INFO:Reassigning the following 0 indicators with less than 6 patients as other: []
INFO:One-hot encoding testing data
INFO:Reassigning the following regimen indicator columns that did not exist in train set as other:
regimen_GI-CISPFU + TRAS(LOAD)     2
regimen_GI-FOLFIRI+PANITUMUM

In [60]:
# Row selectors for the two cohorts recorded in metainfo['split']
train_mask = metainfo['split'] == 'Train'
test_mask = metainfo['split'] == 'Test'

# Slice features, labels, and meta information with the same selector per cohort
X_train, Y_train, metainfo_train = X[train_mask], Y[train_mask], metainfo[train_mask]
X_test, Y_test, metainfo_test = X[test_mask], Y[test_mask], metainfo[test_mask]

In [61]:
# Save the data prep for silent deployment
# So we transform new incoming data using the original data preparer
# save_pickle(prep.scaler, './result', 'scaler_ED')
# save_pickle(prep.imp.imputer, './result', 'imputer_ED')
# save_pickle(prep.clip_thresh, './result', 'clip_thresh_ED')
# save_pickle(prep.ohe.final_columns, './result', 'encoded_cols_ED')

# X.to_csv('./data/debug/to_muammar/X.csv', index=False)
# Y.to_csv('./data/debug/to_muammar/Y.csv', index=False)
# metainfo.to_csv('./data/debug/to_muammar/metainfo.csv', index=False)
# df.loc[X.index].to_csv('./data/debug/to_muammar/orig.csv', index=False)

## Describe Data

In [62]:
# Sessions (rows) and unique patients (mrn) per data split, plus a total column
per_split = metainfo.groupby('split')
count = pd.DataFrame({
    'Number of sessions': per_split.size(),
    'Number of patients': per_split['mrn'].nunique(),
}).T
count['Total'] = count.sum(axis=1)
print(f'\n{count.to_string()}')


split               Test  Train  Total
Number of sessions  1082  10152  11234
Number of patients   389   1927   2316


In [63]:
# Label distribution per session, split by whether the patient had any treatment before the visit
no_trts_prior = df['treatment_date'].isnull()
session_subsets = {
    'First Visit': (Y[no_trts_prior], metainfo[no_trts_prior]),
    'Subsequent Visit': (Y[~no_trts_prior], metainfo[~no_trts_prior]),
    'All': (Y, metainfo),
}
pd.concat(
    [get_label_distribution(y, meta, with_respect_to='sessions') for y, meta in session_subsets.values()],
    keys=list(session_subsets),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Total,Test,Test,Train,Train
Unnamed: 0_level_1,ED_visit,False,True,False,True,False,True
First Visit,ED_visit,1597,309,151,39,1446,270
Subsequent Visit,ED_visit,8425,903,784,108,7641,795
All,ED_visit,10022,1212,935,147,9087,1065


In [64]:
# Same breakdown as the per-session table, but counted per patient
patient_subsets = {
    'First Visit': (Y[no_trts_prior], metainfo[no_trts_prior]),
    'Subsequent Visit': (Y[~no_trts_prior], metainfo[~no_trts_prior]),
    'All': (Y, metainfo),
}
pd.concat(
    [get_label_distribution(y, meta, with_respect_to='patients') for y, meta in patient_subsets.values()],
    keys=list(patient_subsets),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Total,Test,Test,Train,Train
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0,1,0,1,0
First Visit,ED_visit,287,1124,37,126,250,998
Subsequent Visit,ED_visit,527,1584,71,267,456,1317
All,ED_visit,666,1650,96,293,570,1357


In [65]:
# Feature Characteristics
# get original (non-normalized, non-imputed) data one-hot encoded
x = prep.ohe.encode(df.loc[X_train.index].copy(), verbose=False)
# keep only the feature columns: drop meta information and target columns
not_a_feature = x.columns.isin(metainfo.columns) | x.columns.str.startswith('target')
x = x.loc[:, ~not_a_feature]
summary = feature_summary(x, save_path='result/tables/feature_summary_ED_clinic_anchored.csv')
summary.sample(10, random_state=42)

Unnamed: 0,Features,Group,Mean (SD),Missingness (%)
59,Regimen GI-CISPCAPE+TRAS(LOAD),Treatment,0.004 (0.064),0.0
52,ESAS Drowsiness Score Change,Symptoms,0.058 (1.997),56.3
91,Regimen GI-XELOX,Treatment,0.005 (0.071),0.0
5,Female (yes/no),Demographic,0.438 (0.496),0.0
78,"Regimen GI-GEM D1,8",Treatment,0.003 (0.054),0.0
80,Regimen GI-GEM+ABRAXANE,Treatment,0.048 (0.214),0.0
61,Regimen GI-CISPFU + TRAS(MAIN),Treatment,0.002 (0.050),0.0
55,ESAS Dyspnea Score Change,Symptoms,0.005 (1.544),56.5
13,"Topography ICD-0-3 C22, Liver and intrahepatic...",Cancer,0.045 (0.208),0.0
42,Days Since Previous ED Visit,Acute care use,1186.708 (812.315),0.0


## Train Models

In [66]:
# LGBM does not like non alphanumeric characters (except for _)
# Swap each of ( ) + - / , for an underscore in a single pass over the column names
for features in (X_train, X_test):
    features.columns = features.columns.str.replace(r'[()+\-/,]', '_', regex=True)

In [73]:
%%capture
# Hyperparameter tuning
# TODO: try greater kappa for greater exploration
algs = ['LASSO', 'RF', 'Ridge', 'XGB', 'LGBM']
best_params = {}
for alg in algs:
    # filled one algorithm at a time so finished results survive a failure in a later one
    best_params[alg] = tune_params(alg, X_train, Y_train['ED_visit'], metainfo_train)

# Canonical copy that the training cell loads
save_pickle(best_params, './models', 'best_params_clinic_anchored')
# Timestamped backup so earlier tuning runs are not overwritten. The timestamp is formatted
# explicitly because str(datetime.now()) contains a space and colons, and colons are not
# valid in file names on Windows.
timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
save_pickle(best_params, './models', f'best_params_clinic_anchored-{timestamp}')

In [78]:
best_params = load_pickle('./models', 'best_params_clinic_anchored')
models = train_models(X_train, Y_train, metainfo_train, best_params)

## Model Selection
Select final model based on the average performance across the validation folds

In [79]:
evaluate_valid(models, X_train, Y_train, metainfo_train)

Unnamed: 0_level_0,Ridge,Ridge,LASSO,LASSO,XGB,XGB,LGBM,LGBM,RF,RF
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.248335,0.750925,0.24326,0.744364,0.335396,0.798783,0.181312,0.703825,0.306394,0.757898


## Evaluate Model

In [80]:
pd.concat([evaluate_test(model, X_test, Y_test) for alg, model in models.items()], keys=models.keys()).T

Unnamed: 0_level_0,Ridge,Ridge,LASSO,LASSO,XGB,XGB,LGBM,LGBM,RF,RF
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.227827,0.655739,0.227753,0.662432,0.245493,0.6806,0.198982,0.642304,0.252783,0.678075


In [81]:
# Test performance of the XGB model, separately for visits with and without a prior treatment
model = models['XGB']
mask = metainfo_test['treatment_date'].isnull()
visit_masks = {'First Visit': mask, 'Subsequent Visits': ~mask}
pd.concat(
    [evaluate_test(model, X_test[rows], Y_test[rows]) for rows in visit_masks.values()],
    keys=list(visit_masks),
).T

Unnamed: 0_level_0,First Visit,First Visit,Subsequent Visits,Subsequent Visits
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.312476,0.649431,0.227867,0.676729


In [82]:
# compute threshold that achieves 10% alarm rate
# Scan candidate thresholds from 0 upwards and stop at the first one whose alarm rate
# (fraction of predictions above the threshold) is within 0.005 of 10%.
# NOTE(review): the threshold is chosen on the test set predictions - confirm this is intended.
pred = predict(model['ED_visit'], X_test)
for pred_threshold in np.arange(0, 1.0, 0.001):
    alarm_rate = np.mean(pred > pred_threshold)
    if np.isclose(alarm_rate, 0.1, atol=0.005):
        print(f'Prediction threshold: {pred_threshold}\nAlarm Rate: {alarm_rate}')
        break
else:
    # without this branch the loop would end silently and leave pred_threshold at its
    # last candidate value, which is not a valid 10% alarm rate threshold
    logging.warning('No prediction threshold achieves an alarm rate within 0.005 of 10%')

Prediction threshold: 0.219
Alarm Rate: 0.10166358595194085


In [72]:
save_pickle(model['ED_visit'], './models', 'XGB_ED_visit_clinic_anchored')

# Scratch Notes

## Include same-day features

In [None]:
evaluate_valid(models, X_train, Y_train, metainfo_train)

Unnamed: 0_level_0,Ridge,Ridge,LASSO,LASSO,XGB,XGB,LGBM,LGBM,RF,RF
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.269367,0.781758,0.272045,0.783733,0.45703,0.865704,0.278372,0.767941,0.36844,0.8111


In [None]:
pd.concat([evaluate_test(model, X_test, Y_test) for alg, model in models.items()], keys=models.keys()).T

Unnamed: 0_level_0,Ridge,Ridge,LASSO,LASSO,XGB,XGB,LGBM,LGBM,RF,RF
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.237153,0.67575,0.231971,0.679841,0.237409,0.6851,0.219119,0.664609,0.252247,0.714822


In [None]:
# compute threshold that achieves 10% alarm rate
# Scan candidate thresholds from 0 upwards and stop at the first one whose alarm rate
# (fraction of predictions above the threshold) is within 0.004 of 10%.
pred = predict(models['XGB']['ED_visit'], X_test)
for pred_threshold in np.arange(0, 1.0, 0.001):
    alarm_rate = np.mean(pred > pred_threshold)
    if np.isclose(alarm_rate, 0.1, atol=0.004):
        print(f'Prediction threshold: {pred_threshold}\nAlarm Rate: {alarm_rate}')
        break
else:
    # without this branch the loop would end silently and leave pred_threshold at its
    # last candidate value, which is not a valid 10% alarm rate threshold
    logging.warning('No prediction threshold achieves an alarm rate within 0.004 of 10%')

Prediction threshold: 0.222
Alarm Rate: 0.10344827586206896


## Outcome-level sensitivity

In [53]:
def outcome_level_sensitivity(df, lookahead_window: int = 30):
    """Get the proportion of true outcomes where at least one alarm preceded the event

    Every unique (mrn, event_date) pair counts as one outcome. An outcome is a true
    positive if any of its assessments raised an alarm, otherwise a false negative.

    E.g. if ED visit happens on Jan 20, our lookahead window is 30 days, and assessments
        happen on Jan 1 and Jan 14, then the outcome-level true positive is if
        either Jan 1 or Jan 14 trigger a warning, and false negative if neither do

    Args:
        df: One row per assessment, with the columns 'mrn', 'assessment_date',
            'event_date' (both datetime) and 'pred' (truthy when an alarm was raised).
            Rows without an event date are ignored, since groupby drops missing keys.
        lookahead_window: Maximum number of days allowed between an assessment and
            the event it is attributed to.

    Returns:
        tp / (tp + fn) as a float, or nan if df contains no outcomes at all.

    Raises:
        AssertionError: If an assessment is dated after its event or more than
            lookahead_window days before it.
    """
    result = []
    for _, group in df.groupby(['mrn', 'event_date']):

        # ensure assessment date and event date is within X days of each other
        diff = (group['event_date'] - group['assessment_date']).dt.days
        assert all(diff.between(0, lookahead_window)), (
            f'assessment dates must be 0 to {lookahead_window} days before the event date'
        )

        result.append(any(group['pred']))

    if not result:
        # no outcomes -> sensitivity is undefined (previously a ZeroDivisionError)
        return float('nan')

    return sum(result) / len(result) # tp / (tp + fn)

# Smoke test of outcome_level_sensitivity using random alarms in place of model predictions.
# The generator is seeded so that the displayed value is reproducible across runs.
event_df = pd.DataFrame()
event_df[['mrn', 'assessment_date']] = df[['mrn', 'clinic_date']]
event_df['pred'] = np.random.default_rng(42).choice([0, 1], size=len(event_df))
# aligned on the row index shared by df and metainfo
event_df['event_date'] = metainfo['target_ED_visit_date']
outcome_level_sensitivity(event_df)

0.6116838487972509