# Predicting emergency department visits anchored on clinic dates
---
## Background
Previously, we built a model to predict emergency department (ED) visits anchored on treatment dates.

The problem with that approach is that primary physicians do not interact with their patients during treatment sessions; they only meet during clinic visits. A clinic visit is therefore the best time for the model to nudge the physician toward an intervention. Thus, we now want to build a model that predicts a patient's risk of an ED visit prior to the clinic date instead of prior to the treatment session.

---

In [1]:
%%capture
# Move two directories up so the relative ./data, ./models and ./result paths
# used below resolve (presumably the project root — confirm).
# NOTE: this is relative to the current directory, so re-running this cell
# without restarting the kernel moves up another two levels.
%cd ../../
# Automatically reload imported modules before each cell runs, so edits to the
# project packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging

import numpy as np
import pandas as pd
from datetime import datetime

from ml_common.util import load_pickle, save_pickle

from preduce.acu.eval import evaluate_valid, evaluate_test, predict
from preduce.acu.pipeline import PrepACUData
from preduce.acu.train import train_models, tune_params
from preduce.prepare.clinic import get_clinic_visit_data, process_clinic_visits_prior_to_treatment
from preduce.prepare.prep import anchor_features_to_assessment_dates
from preduce.summarize import feature_summary, get_label_distribution

# Show wide/long frames in full when they are displayed in the notebook
pd.options.display.max_rows = 150
pd.options.display.max_columns = 100

# Surface INFO-level messages from the pipeline as "LEVEL:message"
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

## Preprocess clinic visits

In [3]:
# Load the clinic visits and use each clinic date as the assessment (anchor) date.
# The CSV is presumably what the feature-anchoring step reads — confirm.
clinic_visit = get_clinic_visit_data(data_dir='./data/interim')
clinic_visit = clinic_visit.assign(assessment_date=clinic_visit['clinic_date'])
clinic_visit.to_csv('./data/processed/assessment_dates.csv', index=False)

Removing 123 visits that "occured before" 2006-01-05 00:00:00


## Anchor features to clinic visits

In [4]:
# Anchor the features to the assessment (clinic) dates.
# script_path points at a sibling checkout of make-clinical-dataset
# (presumably the scripts that build the dataset — confirm).
# The output name matches the parquet file loaded in "Load feature data".
anchor_features_to_assessment_dates(
    script_path='../make-clinical-dataset/scripts', 
    output_filename='clinic_centered_feature_dataset'
)

## Load feature data

In [52]:
# Clinic-anchored feature dataset and the emergency room visit records
feature_path = './data/processed/clinic_centered_feature_dataset.parquet.gzip'
ed_visit_path = './data/interim/emergency_room_visit.parquet.gzip'
df = pd.read_parquet(feature_path)
emerg = pd.read_parquet(ed_visit_path)

## Prepare Data

In [6]:
# Build the modelling inputs for the ED_visit target.
# The steps are order-dependent: clinic-visit processing, then preprocessing
# against the ED visit records, then preparation into X / Y / metainfo.
clinic_df = process_clinic_visits_prior_to_treatment(df)
prep = PrepACUData()
clinic_df = prep.preprocess(clinic_df, emerg)
X, Y, metainfo = prep.prepare(clinic_df, event_name='ED_visit')
# Keep only the rows present in X so that df stays aligned with the model inputs
df = clinic_df.loc[X.index]

INFO:Removing 0 patients and 5493 sessions not first of a given week
INFO:Removing 2982 patients and 12897 sessions before 2012-01-01 and after 2019-12-31
INFO:Removing the following features for drugs given less than 10 times: ['%_ideal_dose_given_DURVALUMAB', '%_ideal_dose_given_IPILIMUMAB', '%_ideal_dose_given_CAPECITABINE', '%_ideal_dose_given_ERLOTINIB']
INFO:Removing 2585 patients and 10848 sessions not from GI department
INFO:Dropping the following 56 features for missingness over 80%: ['creatinine', 'sodium', 'chloride', 'potassium', 'magnesium', 'red_cell_distribution_width', 'aspartate_aminotransferase', 'alanine_aminotransferase', 'alkaline_phosphatase', 'total_bilirubin', 'glucose', 'phosphate', 'lactate_dehydrogenase', 'albumin', 'eosinophil', 'bicarbonate', 'mean_corpuscular_volume_change', 'mean_corpuscular_hemoglobin_concentration_change', 'mean_corpuscular_hemoglobin_change', 'hemoglobin_change', 'hematocrit_change', 'mean_platelet_volume_change', 'lymphocyte_change', 

In [7]:
# Partition features, labels and metadata using the 'split' column of metainfo
train_mask = metainfo['split'] == 'Train'
test_mask = metainfo['split'] == 'Test'
X_train, Y_train, metainfo_train = (part[train_mask] for part in (X, Y, metainfo))
X_test, Y_test, metainfo_test = (part[test_mask] for part in (X, Y, metainfo))

In [55]:
# Persist the data preparer used above for silent deployment, so that new
# incoming data is transformed with the original preparer rather than a fresh one.
save_pickle(prep, './result', 'prep_ED_visit_clinic_anchored')

## Describe Data

In [9]:
# Number of sessions (rows) and unique patients (by mrn) in each data split.
# groupby(...).size() counts the rows per group directly; it replaces
# apply(len, include_groups=False), whose include_groups argument only exists
# in pandas >= 2.2.
count = pd.DataFrame({
    'Number of sessions': metainfo.groupby('split').size(),
    'Number of patients': metainfo.groupby('split')['mrn'].nunique(),
}).T
# NOTE: the patient total is the sum over splits, so it equals the number of
# unique patients only if no patient appears in more than one split.
count['Total'] = count.sum(axis=1)
print(f'\n{count.to_string()}')


split               Test  Train  Total
Number of sessions  1082  10152  11234
Number of patients   389   1927   2316


In [10]:
# Label distribution per session, broken down by whether a treatment date is
# recorded for the visit (null treatment_date -> 'First Visit').
no_trts_prior = df['treatment_date'].isnull()
session_groups = {
    'First Visit': (Y[no_trts_prior], metainfo[no_trts_prior]),
    'Subsequent Visit': (Y[~no_trts_prior], metainfo[~no_trts_prior]),
    'All': (Y, metainfo),
}
pd.concat(
    [
        get_label_distribution(labels, meta, with_respect_to='sessions')
        for labels, meta in session_groups.values()
    ],
    keys=list(session_groups),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Total,Test,Test,Train,Train
Unnamed: 0_level_1,ED_visit,False,True,False,True,False,True
First Visit,ED_visit,1597,309,151,39,1446,270
Subsequent Visit,ED_visit,8425,903,784,108,7641,795
All,ED_visit,10022,1212,935,147,9087,1065


In [11]:
# Label distribution per patient, broken down by the no_trts_prior mask
# (True -> 'First Visit', False -> 'Subsequent Visit').
patient_groups = {
    'First Visit': (Y[no_trts_prior], metainfo[no_trts_prior]),
    'Subsequent Visit': (Y[~no_trts_prior], metainfo[~no_trts_prior]),
    'All': (Y, metainfo),
}
pd.concat(
    [
        get_label_distribution(labels, meta, with_respect_to='patients')
        for labels, meta in patient_groups.values()
    ],
    keys=list(patient_groups),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Total,Test,Test,Train,Train
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0,1,0,1,0
First Visit,ED_visit,287,1124,37,126,250,998
Subsequent Visit,ED_visit,527,1584,71,267,456,1317
All,ED_visit,666,1650,96,293,570,1357


In [12]:
# Feature characteristics of the training rows.
# One-hot encode the original (non-normalized, non-imputed) data so the summary
# describes raw values rather than the transformed model inputs.
train_raw = prep.ohe.encode(df.loc[X_train.index].copy(), verbose=False)
# Leave out metadata columns and anything named like a target
feature_cols = [
    col for col in train_raw.columns
    if col not in metainfo.columns and not col.startswith('target')
]
summary = feature_summary(
    train_raw[feature_cols],
    save_path='result/tables/feature_summary_ED_clinic_anchored.csv',
)
summary.sample(10, random_state=42)

Unnamed: 0,Features,Group,Mean (SD),Missingness (%)
59,Regimen GI-CISPCAPE+TRAS(LOAD),Treatment,0.004 (0.064),0.0
52,ESAS Drowsiness Score Change,Symptoms,0.058 (1.997),56.3
91,Regimen GI-XELOX,Treatment,0.005 (0.071),0.0
5,Female (yes/no),Demographic,0.438 (0.496),0.0
78,"Regimen GI-GEM D1,8",Treatment,0.003 (0.054),0.0
80,Regimen GI-GEM+ABRAXANE,Treatment,0.048 (0.214),0.0
61,Regimen GI-CISPFU + TRAS(MAIN),Treatment,0.002 (0.050),0.0
55,ESAS Dyspnea Score Change,Symptoms,0.005 (1.544),56.5
13,"Topography ICD-0-3 C22, Liver and intrahepatic...",Cancer,0.045 (0.208),0.0
42,Days Since Previous ED Visit,Acute care use,1186.708 (812.315),0.0


## Train Models

In [13]:
# LGBM does not like non alphanumeric characters (except for _), so replace
# each of ( ) + - / , in the feature names with an underscore in a single pass.
for features in (X_train, X_test):
    features.columns = features.columns.str.replace(r'[()+\-/,]', '_', regex=True)

In [81]:
%%capture
# Hyperparameter tuning
# TODO: try greater kappa for greater exploration
algs = ['LASSO', 'RF', 'Ridge', 'XGB', 'LGBM']
best_params = {}
# Fill the dict one algorithm at a time so partial results stay available in
# the kernel if a later (long-running) tuning call fails.
for alg in algs:
    best_params[alg] = tune_params(alg, X_train, Y_train['ED_visit'], metainfo_train)
# Canonical copy: this is the file the training cell loads
save_pickle(best_params, './models', 'best_params_clinic_anchored')
# Timestamped backup so earlier tuning runs are not overwritten. Format the
# timestamp explicitly: str(datetime.now()) contains spaces, colons and a dot,
# which are invalid or awkward in file names on some platforms.
timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
save_pickle(best_params, './models', f'best_params_clinic_anchored-{timestamp}')

In [98]:
# Reload the tuned hyperparameters from disk, so this cell does not depend on
# the tuning cell having been run in the current kernel session.
best_params = load_pickle('./models', 'best_params_clinic_anchored')
# models is used below as a mapping keyed by algorithm name (e.g. models['XGB'])
models = train_models(X_train, Y_train, metainfo_train, best_params)

## Model Selection
Select the final model based on its average performance across the validation folds.

In [99]:
# Validation-fold scores for every candidate model; the final model is chosen from this table
evaluate_valid(models, X_train, Y_train, metainfo_train)

Unnamed: 0_level_0,Ridge,Ridge,LASSO,LASSO,XGB,XGB,LGBM,LGBM,RF,RF
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.273578,0.772846,0.270908,0.774217,0.558861,0.919677,0.228049,0.71533,0.346846,0.793823


## Evaluate Model

In [100]:
# Held-out test scores for every trained model, one column group per algorithm
test_scores = [evaluate_test(fitted, X_test, Y_test) for fitted in models.values()]
pd.concat(test_scores, keys=models.keys()).T

Unnamed: 0_level_0,Ridge,Ridge,LASSO,LASSO,XGB,XGB,LGBM,LGBM,RF,RF
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.254211,0.66171,0.269777,0.654367,0.239631,0.677871,0.220564,0.653517,0.249247,0.675706


In [101]:
# Score the XGB model separately on first visits (null treatment_date) and on
# subsequent visits.
# NOTE(review): in the recorded outputs XGB's validation AUROC (~0.92) is far
# above its test AUROC (~0.68) — worth checking for overfitting or leakage
# across validation folds before relying on this selection.
model = models['XGB']
is_first_visit = metainfo_test['treatment_date'].isnull()
subgroup_scores = [
    evaluate_test(model, X_test[selector], Y_test[selector])
    for selector in (is_first_visit, ~is_first_visit)
]
pd.concat(subgroup_scores, keys=['First Visit', 'Subsequent Visits']).T

Unnamed: 0_level_0,First Visit,First Visit,Subsequent Visits,Subsequent Visits
Unnamed: 0_level_1,AUPRC,AUROC,AUPRC,AUROC
ED_visit,0.304225,0.656868,0.217102,0.669823


In [102]:
# Compute the prediction threshold that achieves (approximately) a 10% alarm rate
target_alarm_rate = 0.1
tolerance = 0.005
pred = predict(model['ED_visit'], X_test)
# Scan thresholds in ascending order and stop at the first one whose alarm rate
# falls within the tolerance of the target.
for pred_threshold in np.arange(0, 1.0, 0.001):
    # alarm rate = fraction of predictions strictly above the threshold
    alarm_rate = np.mean(pred > pred_threshold)
    if np.isclose(alarm_rate, target_alarm_rate, atol=tolerance):
        print(f'Prediction threshold: {pred_threshold}\nAlarm Rate: {alarm_rate}')
        break
else:
    # No break: nothing matched. Say so explicitly, because pred_threshold and
    # alarm_rate now hold the values from the last scanned threshold, not a
    # valid operating point.
    logging.warning(
        'No prediction threshold achieves an alarm rate within %s of %s',
        tolerance, target_alarm_rate,
    )

Prediction threshold: 0.198
Alarm Rate: 0.10382513661202186


In [103]:
# Persist the ED_visit estimator of the selected model (model is models['XGB'] from the evaluation cell above)
save_pickle(model['ED_visit'], './models', 'XGB_ED_visit_clinic_anchored')